Fixing some bugs

parent e65ff7ab
This diff could not be displayed because it is too large.
......@@ -14,7 +14,7 @@ class EmotionAnalyzer(Analyzer):
polarityClassifier: a pipeline that uses a model for inferring the emotions of the text of a sequence.
"""
def __init__(self, task = "text-classification",modelEmotions = 'pysentimiento/robertuito-emotion-analysis', allScores = True):
def __init__(self, task = "text-classification",modelEmotions = 'pysentimiento/robertuito-emotion-analysis', allScores = True, maxEmbedding = 130):
"""
Create an emotions analyzer.
......@@ -22,8 +22,10 @@ class EmotionAnalyzer(Analyzer):
task: the task defining which pipeline will be returned.
model: the model that will be used by the pipeline to make predictions.
allScores: True, if we want that the classifier returns all scores. False, in other case.
maxEmbedding: The number of max_position_embeddings in the config.json of the model selected.
"""
self.emotionsClassifier = pipeline(task,model=modelEmotions, return_all_scores=allScores)
self.maxEmbedding = maxEmbedding
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
......@@ -51,7 +53,7 @@ class EmotionAnalyzer(Analyzer):
"""
arrayResults =[]
for text in arrayText:
prediction = self.emotionsClassifier(text)
prediction = self.emotionsClassifier(text[:self.maxEmbedding])
#arrayResults.append(prediction[0][0])
arrayResults.append(prediction)
return arrayResults
......
......@@ -11,7 +11,7 @@ class PolarityAnalyzer(Analyzer):
polarityClassifier: a pipeline that uses a model for inferring the polarity of the text of a sequence.
"""
def __init__(self, task = "text-classification",modelPolarity = 'finiteautomata/beto-sentiment-analysis', allScores = True):
def __init__(self, task = "text-classification",modelPolarity = 'finiteautomata/beto-sentiment-analysis', allScores = True, maxEmbedding = 512):
"""
Create a polarity analyzer.
......@@ -19,8 +19,10 @@ class PolarityAnalyzer(Analyzer):
task: the task defining which pipeline will be returned
model: the model that will be used by the pipeline to make predictions
allScores: True, if we want that the classifier returns all scores. False, in other case
maxEmbedding: The number of max_position_embeddings in the config.json of the model selected.
"""
self.polarityClassifier = pipeline(task,model= modelPolarity, return_all_scores=allScores)
self.maxEmbeding = maxEmbedding
......@@ -48,7 +50,7 @@ class PolarityAnalyzer(Analyzer):
"""
arrayResults =[]
for text in arrayText:
prediction = self.polarityClassifier(text)
prediction = self.polarityClassifier(text[:self.maxEmbeding])
#arrayResults.append(prediction[0][0])
arrayResults.append(prediction)
return arrayResults
......
......@@ -69,7 +69,7 @@ class StylometryAnalyzer(Analyzer):
resultsList = []
for t in arrayText:
t.lower()
tokens = self.tokenizer.tokenize (t)
tokens = self.tokenizer.tokenize(t)
text= [token.lower() for token in tokens]
self.freqWords(text,self.stopwords,self.puntuation)
self.funcionesTTR(text)
......@@ -100,9 +100,20 @@ class StylometryAnalyzer(Analyzer):
self.numWordFreqOne = len( [token[0] for token in self.freqWord if token[1] == 1 ])
self.TTR = len(self.uniqueWords) / len(text)
self.RTTR = len(self.uniqueWords) / math.sqrt(len(text))
self.herdan = math.log(len(self.uniqueWords),10) / math.log(len(text),10)
self.mass = (math.log(len(text),10)- math.log(len(self.uniqueWords),10)) / pow(math.log(len(self.uniqueWords),10),2)
self.somers = math.log(math.log(len(self.uniqueWords),10),10) / math.log(math.log(len(text),10),10)
if len(text)== 1:
self.herdan = math.log(len(self.uniqueWords),10)
else:
self.herdan = math.log(len(self.uniqueWords),10) / math.log(len(text),10)
if pow(math.log(len(self.uniqueWords),10),2) == 0:
self.mass = (math.log(len(text),10)- math.log(len(self.uniqueWords),10))
else:
self.mass = (math.log(len(text),10)- math.log(len(self.uniqueWords),10)) / pow(math.log(len(self.uniqueWords),10),2)
if len(text) == 10:
self.somers = math.log(math.log(len(self.uniqueWords),10),10)
elif len(self.uniqueWords) == 10 or len(self.uniqueWords) == 1:
self.somers = 0
else:
self.somers = math.log(math.log(len(self.uniqueWords),10),10) / math.log(math.log(len(text),10),10)
if math.log(len(text),10)- math.log(len(self.uniqueWords),10) == 0:
self.dugast = pow(math.log(len(text),10),2)
else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment