Implementing some analyzers

parent f3e06440
...@@ -5,14 +5,16 @@ description = "A text analysis library for Python" ...@@ -5,14 +5,16 @@ description = "A text analysis library for Python"
authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@red.ujaen.es>"] authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@red.ujaen.es>"]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.8" python = "3.8"
nltk = "^3.7" nltk = "^3.7"
spacy = "^3.3.0" spacy = "^3.3.0"
transformers = "^4.18.0" transformers = "^4.19.0"
torch = {version = "^1.11.0", python = "^3.7", platform = "linux"}
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
pytest = "^5.2" pytest = "^5.2"
[build-system] [build-system]
requires = ["poetry-core>=1.0.0"] requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
...@@ -7,11 +7,13 @@ import re ...@@ -7,11 +7,13 @@ import re
import numpy as np import numpy as np
import math import math
from functools import reduce from functools import reduce
from textflow.Analyzer import Analyzer
creaPath = os.path.join(os.path.dirname(__file__), 'Crea-5000.txt') creaPath = os.path.join(os.path.dirname(__file__), 'Crea-5000.txt')
class ComplexityAnalyzer: class ComplexityAnalyzer(Analyzer):
def __init__(self, lang = "es"): def __init__(self, rutaArchivoCrea = creaPath,lang = "es"):
"""Creates an analyzer from an input object. """Creates an analyzer from an input object.
Args: Args:
...@@ -22,11 +24,7 @@ class ComplexityAnalyzer: ...@@ -22,11 +24,7 @@ class ComplexityAnalyzer:
spacy.cli.download("es_core_news_sm") spacy.cli.download("es_core_news_sm")
self.nlp = spacy.load("es_core_news_sm") self.nlp = spacy.load("es_core_news_sm")
#Vamos a cargar CREA: #Vamos a cargar CREA:
self.dicFreqWords=self.read(creaPath) self.dicFreqWords=self.read(rutaArchivoCrea)
self.function = self.complexity
'''elif lang == "en":
spacy.cli.download("en_core_web_sm")
self.nlp = spacy.load("en_core_web_sm")'''
#Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata #Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
...@@ -42,7 +40,8 @@ class ComplexityAnalyzer: ...@@ -42,7 +40,8 @@ class ComplexityAnalyzer:
Raises: Raises:
ValueError if the levelOfResult is incorrect ValueError if the levelOfResult is incorrect
""" """
if levelOfResult == "": super().analyze(self.complexity,sequence, tag, levelOfAnalyzer, levelOfResult, True)
'''if levelOfResult == "":
analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO
resultOfAnalisys= [] resultOfAnalisys= []
for i in analyzeResult: for i in analyzeResult:
...@@ -64,7 +63,7 @@ class ComplexityAnalyzer: ...@@ -64,7 +63,7 @@ class ComplexityAnalyzer:
else: else:
children = [c.children for c in child[r]] children = [c.children for c in child[r]]
else: else:
raise ValueError(f"Sequence level '{r}' not found in {child}") raise ValueError(f"Sequence level '{r}' not found in {child}") '''
def read(self,fichero): def read(self,fichero):
......
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch
class EmotionAnalyzer(Analyzer):
    """Analyzer that labels plain texts with emotions using a transformers pipeline."""

    def __init__(self, task = "text-classification",modelEmotions = 'pysentimiento/robertuito-emotion-analysis', allScores = True):
        """Create an emotion analyzer backed by a Hugging Face pipeline.

        Args:
            task: the transformers pipeline task name.
            modelEmotions: the model checkpoint used to predict emotions.
            allScores: if True the classifier returns the score of every label,
                not only the top one.
        """
        self.emotionsClassifier = pipeline(task, model=modelEmotions, return_all_scores=allScores)

    # This analyzer only handles plain text, so the result is always stored in metadata.
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
        """Analyze a sequence and store the emotion predictions under `tag`.

        Args:
            sequence: the Sequence to analyze.
            tag: the label under which the analysis result is stored.
            levelOfAnalyzer: path of the sequence level to analyze inside the result.
            levelOfResult: path of the sequence level where the result is stored.

        Raises:
            ValueError: if levelOfResult is incorrect.
        """
        super().analyze(self.emotions, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def emotions(self, arrayText):
        """Run the emotion classifier over each text and collect the raw predictions.

        Args:
            arrayText: iterable of strings to classify.

        Returns:
            A list with the classifier output for each input text.
        """
        return [self.emotionsClassifier(text) for text in arrayText]
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
spacy.cli.download("es_core_news_sm")
class LemmaAnalyzer(Analyzer):
    """Analyzer that extracts lemmas and simple lemma statistics from texts."""

    def __init__(self, nlp = None, posNoContent = ("PUNCT", "SPACE", "SYM")):
        """Create a lemma analyzer.

        Args:
            nlp: a spaCy language pipeline. When None, "es_core_news_sm" is
                loaded lazily here instead of in the parameter default — the
                original default loaded the model at class-definition time,
                even when every caller supplied its own pipeline.
            posNoContent: POS tags whose tokens carry no content and are
                skipped when collecting lemmas.
        """
        if nlp is None:
            nlp = spacy.load("es_core_news_sm")
        self.nlp = nlp
        # Copy to a list so a caller-supplied tuple behaves like the original default.
        self.posNoContent = list(posNoContent)

    # This analyzer only handles plain text, so the result is always stored in metadata.
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
        """Analyze a sequence and store the lemma statistics under `tag`.

        Args:
            sequence: the Sequence to analyze.
            tag: the label under which the analysis result is stored.
            levelOfAnalyzer: path of the sequence level to analyze inside the result.
            levelOfResult: path of the sequence level where the result is stored.

        Raises:
            ValueError: if levelOfResult is incorrect.
        """
        super().analyze(self.lemmas, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def lemmas(self, arrayText):
        """Lemmatize each text, skipping non-content tokens.

        Args:
            arrayText: iterable of strings to lemmatize.

        Returns:
            A list with one dict per text: "srclemmas" (lemmas in order),
            "uniqueLemmas" (number of distinct lemmas) and "avgLemmas"
            (average lemma length, rounded; 0 when the text has no content tokens).
        """
        arrayResult = []
        for text in arrayText:
            sequenceLemmas = []
            setLemmas = set()
            sumaLenLemmas = 0
            for token in self.nlp(text):
                if token.pos_ not in self.posNoContent:
                    sumaLenLemmas += len(token.lemma_)
                    setLemmas.add(token.lemma_)
                    sequenceLemmas.append(token.lemma_)
            lemma = {
                "srclemmas": sequenceLemmas,
                "uniqueLemmas": len(setLemmas),
                # Guard: a text made only of punctuation/whitespace would
                # otherwise raise ZeroDivisionError.
                "avgLemmas": round(sumaLenLemmas / len(sequenceLemmas)) if sequenceLemmas else 0,
            }
            arrayResult.append(lemma)
        return arrayResult
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
spacy.cli.download("es_core_news_sm")
class POSAnalyzer(Analyzer):
    """Analyzer that tags each token of a text with its part of speech (POS)."""

    def __init__(self, nlp = None):
        """Create a POS analyzer.

        Args:
            nlp: a spaCy language pipeline. When None, "es_core_news_sm" is
                loaded lazily here instead of in the parameter default — the
                original default loaded the model at class-definition time,
                even when every caller supplied its own pipeline.
        """
        if nlp is None:
            nlp = spacy.load("es_core_news_sm")
        self.nlp = nlp

    # This analyzer only handles plain text, so the result is always stored in metadata.
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
        """Analyze a sequence and store the POS tags under `tag`.

        Args:
            sequence: the Sequence to analyze.
            tag: the label under which the analysis result is stored.
            levelOfAnalyzer: path of the sequence level to analyze inside the result.
            levelOfResult: path of the sequence level where the result is stored.

        Raises:
            ValueError: if levelOfResult is incorrect.
        """
        super().analyze(self.pos, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def pos(self, arrayText):
        """Tag every text and count the frequency of each POS label.

        Args:
            arrayText: iterable of strings to tag.

        Returns:
            A list with one dict per text: "srcPOS" (one tag per token, in
            order) and "FreqPOS" (tag -> number of occurrences).
        """
        arrayResults = []
        for text in arrayText:
            srcPOS = []
            dicFreqPOS = {}
            for token in self.nlp(text):
                srcPOS.append(token.pos_)
                # dict.get replaces the original if/else counting branch.
                dicFreqPOS[token.pos_] = dicFreqPOS.get(token.pos_, 0) + 1
            arrayResults.append({
                "srcPOS": srcPOS,
                "FreqPOS": dicFreqPOS,
            })
        return arrayResults
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch
class PolarityAnalyzer(Analyzer):
    """Analyzer that labels plain texts with sentiment polarity using a transformers pipeline."""

    def __init__(self, task = "text-classification",modelPolarity = 'finiteautomata/beto-sentiment-analysis', allScores = True):
        """Create a polarity analyzer backed by a Hugging Face pipeline.

        Args:
            task: the transformers pipeline task name.
            modelPolarity: the model checkpoint used to predict polarity.
            allScores: if True the classifier returns the score of every label,
                not only the top one.
        """
        self.polarityClassifier = pipeline(task, model=modelPolarity, return_all_scores=allScores)

    # This analyzer only handles plain text, so the result is always stored in metadata.
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
        """Analyze a sequence and store the polarity predictions under `tag`.

        Args:
            sequence: the Sequence to analyze.
            tag: the label under which the analysis result is stored.
            levelOfAnalyzer: path of the sequence level to analyze inside the result.
            levelOfResult: path of the sequence level where the result is stored.

        Raises:
            ValueError: if levelOfResult is incorrect.
        """
        super().analyze(self.polarity, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def polarity(self, arrayText):
        """Run the polarity classifier over each text and collect the raw predictions.

        Args:
            arrayText: iterable of strings to classify.

        Returns:
            A list with the classifier output for each input text.
        """
        return [self.polarityClassifier(text) for text in arrayText]
...@@ -4,6 +4,7 @@ from nltk.tokenize import TreebankWordTokenizer ...@@ -4,6 +4,7 @@ from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import WhitespaceTokenizer from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import SpaceTokenizer from nltk.tokenize import SpaceTokenizer
from nltk.tokenize import WordPunctTokenizer from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import RegexpTokenizer
class SequenceIterator: class SequenceIterator:
......
import string
from typing import Optional from typing import Optional
import spacy #import spacy
import spacy.cli #import spacy.cli
from nltk.text import Text
from nltk.tokenize import WhitespaceTokenizer
import math
from textflow.Analyzer import Analyzer
class StylometryyAnalyzer: #TODO class StylometryAnalyzer(Analyzer): #TODO
def __init__(self, lang = "es"):
if lang == "es": def __init__(self,stopwords, puntuation = string.punctuation,tokenizer = WhitespaceTokenizer()):
spacy.cli.download("es_core_news_sm") self.stopwords = stopwords
self.nlp = spacy.load("es_core_news_sm") self.puntuation = puntuation
self.function = self.stylometry self.tokenizer = tokenizer
pass
#Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata #Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str]= ""):
"""Analyze a sequence super().analyze(self.stylometry,sequence, tag, levelOfAnalyzer, levelOfResult, True)
Args: def stylometry(self, arrayText):
sequence: the Sequence we want to analyze resultsList = []
tag: the label to store the analysis resut for t in arrayText:
levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado) #doc = self.nlp(text)
levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion) t.lower()
analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False) tokens = self.tokenizer.tokenize (t)
text= [token.lower() for token in tokens]
Raises: self.freqWords(text,self.stopwords,self.puntuation)
ValueError if the levelOfResult is incorrect self.funcionesTTR(text)
""" result={
if levelOfResult == "": "uniqueWords": len(self.uniqueWords),
analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO "TTR": self.TTR,
resultOfAnalisys= [] "RTTR": self.RTTR,
for i in analyzeResult: "Herdan": self.herdan,
resultOfAnalisys.append(i) "Mass": self.mass,
sequence.metadata[tag] = resultOfAnalisys "Somers": self.somers,
"Dugast": self.dugast,
"Honore": self.honore,
"FreqStopWords": self.freqStopWords,
"FreqPuntuationMarks": self.freqPuntuationMarks,
"FreqWords": self.freqWord
}
resultsList.append(result)
return resultsList
def funcionesTTR(self, text):
self.uniqueWords = [token[0] for token in self.freqWord]
self.numWordFreqOne = len( [token[0] for token in self.freqWord if token[1] == 1 ])
self.TTR = len(self.uniqueWords) / len(text)
self.RTTR = len(self.uniqueWords) / math.sqrt(len(text))
self.herdan = math.log(len(self.uniqueWords),10) / math.log(len(text),10)
self.mass = (math.log(len(text),10)- math.log(len(self.uniqueWords),10)) / pow(math.log(len(self.uniqueWords),10),2)
self.somers = math.log(math.log(len(self.uniqueWords),10),10) / math.log(math.log(len(text),10),10)
if math.log(len(text),10)- math.log(len(self.uniqueWords),10) == 0:
self.dugast = pow(math.log(len(text),10),2)
else: else:
children = [sequence.children] self.dugast = pow(math.log(len(text),10),2) / (math.log(len(text),10)- math.log(len(self.uniqueWords),10))
ruta = levelOfResult.split("/") if 1-(self.numWordFreqOne/len(self.uniqueWords)) == 0:
for r in ruta: #Para cada nivel de la ruta self.honore = 100*(math.log(len(text),10))
for child in children: #Miramos en todas las secuencias disponibles else:
if r in child: #Si dentro de la secuencia actual está r self.honore = 100*(math.log(len(text),10)/(1-(self.numWordFreqOne/len(self.uniqueWords))))
if r == ruta[-1]:
for seq in child[r]:
analyzeResult = seq.filterMetadata(levelOfAnalyzer,self.function) def freqWords(self,tokens, stopWords, puntuationMarks):
resultOfAnalisys= [] freqStopWords = {}
for i in analyzeResult: freqPuntuationMarks = {}
resultOfAnalisys.append(i) freqWord ={}
seq.metadata[tag] = resultOfAnalisys for token in tokens:
else: if token in stopWords:
children = [c.children for c in child[r]] if token in freqStopWords:
else: freqStopWords[token] += 1
raise ValueError(f"Sequence level '{r}' not found in {child}") else:
freqStopWords[token] = 1
def stylometry(self): elif token in puntuationMarks:
pass if token in freqPuntuationMarks:
freqPuntuationMarks[token] += 1
else:
freqPuntuationMarks[token] = 1
else:
if token in freqWord:
freqWord[token] += 1
else:
freqWord[token] = 1
self.freqWord = sorted(freqWord.items(), reverse = True)
self.freqPuntuationMarks = sorted(freqPuntuationMarks.items(), reverse = True)
self.freqStopWords = sorted(freqStopWords.items(), reverse = True)
\ No newline at end of file
from typing import Optional
from textflow.Sequence import Sequence
from nltk.tokenize import WhitespaceTokenizer
from textflow.Analyzer import Analyzer
class VolumetryAnalyzer(Analyzer):
    """Analyzer that computes simple volumetric statistics (word/char counts) of a text."""

    def __init__(self, tokenizer = None):
        """Create a volumetry analyzer.

        Args:
            tokenizer: tokenizer used to split texts into words; defaults to a
                fresh nltk WhitespaceTokenizer. The lazy default avoids the
                single shared instance the original created at
                class-definition time.
        """
        self.tokenizer = tokenizer if tokenizer is not None else WhitespaceTokenizer()

    def volumetry(self, arrayText):
        """Compute the volumetry of each text.

        Args:
            arrayText: iterable of strings to measure.

        Returns:
            A list with one dict per text: "words" (token count),
            "uniqueWords" (distinct token count), "chars" (character count)
            and "avgWordsLen" (rounded average word length; 0 for texts with
            no tokens).
        """
        arrayResults = []
        for texts in arrayText:
            text = self.tokenizer.tokenize(texts)
            dicResults = {
                "words": len(text),
                "uniqueWords": len(set(text)),
                "chars": len(texts),
                # Guard: empty or whitespace-only input yields no tokens and
                # would otherwise raise ZeroDivisionError.
                "avgWordsLen": round(len(texts) / len(text)) if text else 0,
            }
            arrayResults.append(dicResults)
        return arrayResults

    # The sequence must expose a "text" metadata attribute for this analyzer to work;
    # the result is always stored in metadata.
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
        """Analyze a sequence and store the volumetry result under `tag`.

        Args:
            sequence: the Sequence to analyze.
            tag: the label under which the analysis result is stored.
            levelOfAnalyzer: path of the sequence level to analyze inside the result.
            levelOfResult: path of the sequence level where the result is stored.

        Raises:
            ValueError: if levelOfResult is incorrect.
        """
        super().analyze(self.volumetry, sequence, tag, levelOfAnalyzer, levelOfResult, True)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment