Commit 644258c2 by Jaime Collado

Preprocessing bugfix: use PI-RADS patterns instead of BI-RADS in text preprocessing

parent 3b604dad
Showing with 13 additions and 13 deletions
......@@ -36,7 +36,7 @@ with open("./classifiers/pirads_model.joblib", "rb") as pickled_file:
def _predict(text, model, vectorizer):
# Preprocess input text
clean_text = utils.preprocessing_text(text)
clean_text = utils.clear_birads(clean_text)
clean_text = utils.clear_pirads(clean_text)
# Vectorize text
X_test = vectorizer.transform([clean_text])
......
......@@ -6,7 +6,7 @@ def preprocessing_text(s):
s = re.sub('\t+', ' ', s)
# expand the uppercase BR/PR shorthand to the full term (runs before lowercasing, so only uppercase matches)
s = re.sub(r'BR', r'birads', s)
s = re.sub(r'PR', r'pirads', s)
# string to lower
s = s.strip().lower()
......@@ -14,9 +14,9 @@ def preprocessing_text(s):
s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
# replace spelling variants of birads/pirads (e.g. 'pi rads', 'pi-rads') with the canonical token
synon = ['bi rads', 'bi-rads', 'b-rads', 'birads-', 'birads -', 'bi_rads', 'birads/']
synon = ['pi rads', 'pi-rads', 'p-rads', 'pirads-', 'pirads -', 'pi_rads', 'pirads/']
for sy in synon:
s = re.sub(sy, r'birads ', s)
s = re.sub(sy, r'pirads ', s)
s = re.sub(' +', ' ', s)
......@@ -32,14 +32,14 @@ def preprocessing_text(s):
}
for key, value in dic_roman.items():
start = 'birads ' + key
end = 'birads ' + value
start = 'pirads ' + key
end = 'pirads ' + value
s = re.sub(start, end, s)
s = re.sub(' +', ' ', s)
s = re.sub(r'birads (\d)([a-z])', r'birads \1 \2', s)
s = re.sub(r'birads (\d) - (\d)', r'birads \1 birads \2', s)
s = re.sub(r'pirads (\d)([a-z])', r'pirads \1 \2', s)
s = re.sub(r'pirads (\d) - (\d)', r'pirads \1 pirads \2', s)
s = re.sub(' +', ' ', s)
......@@ -53,8 +53,8 @@ def preprocessing_text(s):
s = re.sub(' +', ' ', s)
# s = re.sub(r' b'+str(i)+' ', r' birads '+str(i)+' ', s)
# s = re.sub(r' b '+str(i)+' ', r' birads '+str(i)+' ', s)
# s = re.sub(r' b'+str(i)+' ', r' pirads '+str(i)+' ', s)
# s = re.sub(r' b '+str(i)+' ', r' pirads '+str(i)+' ', s)
# replace separate numbers e.g.: 4 x 5 . 9 by 4x5.9
for n in [',', 'x', '.']:
......@@ -65,9 +65,9 @@ def preprocessing_text(s):
def clear_birads(text):
text = re.sub(r'birads.{1,3}\d{1}[a|b|c]?', '', text)
text = re.sub(r'birads categoria \d{1}', '', text)
def clear_pirads(text):
text = re.sub(r'pirads.{1,3}\d{1}[a|b|c]?', '', text)
text = re.sub(r'pirads categoria \d{1}', '', text)
text = text.replace("( )", "")
text = re.sub('\t+', ' ', text)
text = re.sub(' +', ' ', text)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment