Preprocessing bugfix

644258c2 · Jaime Collado · 3b604dad · 644258c2 · 644258c2
Commit 644258c2 authored Mar 21, 2023 by Jaime Collado
Showing 2 changed files with 13 additions and 13 deletions
app.py
utils.py
--- a/app.py
+++ b/app.py
@@ -36,7 +36,7 @@ with open("./classifiers/pirads_model.joblib", "rb") as pickled_file:
 def _predict(text, model, vectorizer):
    # Preprocess input text
    clean_text = utils.preprocessing_text(text)
-    clean_text = utils.clear_birads(clean_text)
+    clean_text = utils.clear_pirads(clean_text)
    # Vectorize text
    X_test = vectorizer.transform([clean_text])

--- a/utils.py
+++ b/utils.py
@@ -6,7 +6,7 @@ def preprocessing_text(s):
    s = re.sub('\t+', ' ', s)
    # Unicode normalization
-    s = re.sub(r'BR', r'birads', s)
+    s = re.sub(r'PR', r'pirads', s)
    # string to lower
    s = s.strip().lower()
@@ -14,9 +14,9 @@ def preprocessing_text(s):
    s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    # replace synonyms of birads
-    synon = ['bi rads', 'bi-rads', 'b-rads', 'birads-', 'birads -', 'bi_rads', 'birads/']
+    synon = ['pi rads', 'pi-rads', 'p-rads', 'pirads-', 'pirads -', 'pi_rads', 'pirads/']
    for sy in synon:
-        s = re.sub(sy, r'birads ', s)
+        s = re.sub(sy, r'pirads ', s)
    s = re.sub(' +', ' ', s)
@@ -32,14 +32,14 @@ def preprocessing_text(s):
    }
    for key, value in dic_roman.items():
-        start = 'birads ' + key
+        start = 'pirads ' + key
-        end = 'birads ' + value
+        end = 'pirads ' + value
        s = re.sub(start, end, s)
    s = re.sub(' +', ' ', s)
-    s = re.sub(r'birads (\d)([a-z])', r'birads \1 \2', s)
+    s = re.sub(r'pirads (\d)([a-z])', r'pirads \1 \2', s)
-    s = re.sub(r'birads (\d) - (\d)', r'birads \1 birads \2', s)
+    s = re.sub(r'pirads (\d) - (\d)', r'pirads \1 pirads \2', s)
    s = re.sub(' +', ' ', s)
@@ -53,8 +53,8 @@ def preprocessing_text(s):
    s = re.sub(' +', ' ', s)
-    # s = re.sub(r' b'+str(i)+' ', r' birads '+str(i)+' ', s)
+    # s = re.sub(r' b'+str(i)+' ', r' pirads '+str(i)+' ', s)
-    # s = re.sub(r' b '+str(i)+' ', r' birads '+str(i)+' ', s)
+    # s = re.sub(r' b '+str(i)+' ', r' pirads '+str(i)+' ', s)
    # replace separate numbers e.g.: 4 x 5 . 9 by 4x5.9
    for n in [',', 'x', '.']:
@@ -65,9 +65,9 @@ def preprocessing_text(s):
-def clear_birads(text):
+def clear_pirads(text):
-    text = re.sub(r'birads.{1,3}\d{1}[a|b|c]?', '', text)
+    text = re.sub(r'pirads.{1,3}\d{1}[a|b|c]?', '', text)
-    text = re.sub(r'birads categoria \d{1}', '', text)
+    text = re.sub(r'pirads categoria \d{1}', '', text)
    text = text.replace("(  )", "")
    text = re.sub('\t+', ' ', text)
    text = re.sub(' +', ' ', text)