Commit 644258c2 by Jaime Collado

Preprocessing bugfix

parent 3b604dad
Showing 2 changed files with 13 additions and 13 deletions
...@@ -36,7 +36,7 @@ with open("./classifiers/pirads_model.joblib", "rb") as pickled_file: ...@@ -36,7 +36,7 @@ with open("./classifiers/pirads_model.joblib", "rb") as pickled_file:
def _predict(text, model, vectorizer): def _predict(text, model, vectorizer):
# Preprocess input text # Preprocess input text
clean_text = utils.preprocessing_text(text) clean_text = utils.preprocessing_text(text)
clean_text = utils.clear_birads(clean_text) clean_text = utils.clear_pirads(clean_text)
# Vectorize text # Vectorize text
X_test = vectorizer.transform([clean_text]) X_test = vectorizer.transform([clean_text])
......
...@@ -6,7 +6,7 @@ def preprocessing_text(s): ...@@ -6,7 +6,7 @@ def preprocessing_text(s):
s = re.sub('\t+', ' ', s) s = re.sub('\t+', ' ', s)
# Unicode normalization # Unicode normalization
s = re.sub(r'BR', r'birads', s) s = re.sub(r'PR', r'pirads', s)
# string to lower # string to lower
s = s.strip().lower() s = s.strip().lower()
...@@ -14,9 +14,9 @@ def preprocessing_text(s): ...@@ -14,9 +14,9 @@ def preprocessing_text(s):
s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn') s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
# replace synonyms of birads # replace synonyms of birads
synon = ['bi rads', 'bi-rads', 'b-rads', 'birads-', 'birads -', 'bi_rads', 'birads/'] synon = ['pi rads', 'pi-rads', 'p-rads', 'pirads-', 'pirads -', 'pi_rads', 'pirads/']
for sy in synon: for sy in synon:
s = re.sub(sy, r'birads ', s) s = re.sub(sy, r'pirads ', s)
s = re.sub(' +', ' ', s) s = re.sub(' +', ' ', s)
...@@ -32,14 +32,14 @@ def preprocessing_text(s): ...@@ -32,14 +32,14 @@ def preprocessing_text(s):
} }
for key, value in dic_roman.items(): for key, value in dic_roman.items():
start = 'birads ' + key start = 'pirads ' + key
end = 'birads ' + value end = 'pirads ' + value
s = re.sub(start, end, s) s = re.sub(start, end, s)
s = re.sub(' +', ' ', s) s = re.sub(' +', ' ', s)
s = re.sub(r'birads (\d)([a-z])', r'birads \1 \2', s) s = re.sub(r'pirads (\d)([a-z])', r'pirads \1 \2', s)
s = re.sub(r'birads (\d) - (\d)', r'birads \1 birads \2', s) s = re.sub(r'pirads (\d) - (\d)', r'pirads \1 pirads \2', s)
s = re.sub(' +', ' ', s) s = re.sub(' +', ' ', s)
...@@ -53,8 +53,8 @@ def preprocessing_text(s): ...@@ -53,8 +53,8 @@ def preprocessing_text(s):
s = re.sub(' +', ' ', s) s = re.sub(' +', ' ', s)
# s = re.sub(r' b'+str(i)+' ', r' birads '+str(i)+' ', s) # s = re.sub(r' b'+str(i)+' ', r' pirads '+str(i)+' ', s)
# s = re.sub(r' b '+str(i)+' ', r' birads '+str(i)+' ', s) # s = re.sub(r' b '+str(i)+' ', r' pirads '+str(i)+' ', s)
# replace separate numbers e.g.: 4 x 5 . 9 by 4x5.9 # replace separate numbers e.g.: 4 x 5 . 9 by 4x5.9
for n in [',', 'x', '.']: for n in [',', 'x', '.']:
...@@ -65,9 +65,9 @@ def preprocessing_text(s): ...@@ -65,9 +65,9 @@ def preprocessing_text(s):
def clear_birads(text): def clear_pirads(text):
text = re.sub(r'birads.{1,3}\d{1}[a|b|c]?', '', text) text = re.sub(r'pirads.{1,3}\d{1}[a|b|c]?', '', text)
text = re.sub(r'birads categoria \d{1}', '', text) text = re.sub(r'pirads categoria \d{1}', '', text)
text = text.replace("( )", "") text = text.replace("( )", "")
text = re.sub('\t+', ' ', text) text = re.sub('\t+', ' ', text)
text = re.sub(' +', ' ', text) text = re.sub(' +', ' ', text)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment