new analyzers added

61cb8472 · Estrella Vallecillo · fd498956 · 61cb8472 · fd498956 · 61cb8472
Commit 61cb8472 authored Jun 22, 2022 by Estrella Vallecillo
Showing with 381 additions and 9 deletions
README.md
README.rst
textflow/EmojiAnalyzer.py
textflow/EmotionAnalyzer.py
textflow/IronityAnalyzer.py
textflow/NERAnalyzer.py
textflow/NGramsAnalyzer.py
textflow/PolarityAnalyzer.py
textflow/SequenceToken.py
textflow/StylometryAnalyzer.py
--- a/README.md
+++ b/README.md
@@ -36,6 +36,14 @@ This class provides methods to create a sequence from directories, documents, st

 - **[VolumetryAnalyzer.py](https://gitlab.ujaen.es/jcollado/textflow/blob/master/textflow/VolumetryAnalyzer.py):** This module provides a class methods for the calculation of different volumetry metrics on a sequence. This class inherits from Analyzer.py

+- **[IronityAnalyzer.py](https://gitlab.ujaen.es/jcollado/textflow/blob/master/textflow/IronityAnalyzer.py):** This module provides a class with methods for calculating the irony of a sequence. This class inherits from Analyzer.py
+
+- **[NERAnalyzer.py](https://gitlab.ujaen.es/jcollado/textflow/blob/master/textflow/NERAnalyzer.py):** This module provides a class with methods for finding the named entities (NER) of a sequence. This class inherits from Analyzer.py
+
+- **[NGramsAnalyzer.py](https://gitlab.ujaen.es/jcollado/textflow/blob/master/textflow/NGramsAnalyzer.py):** This module provides a class with methods for calculating the n-grams of a sequence and their frequency. This class inherits from Analyzer.py
+
+- **[EmojiAnalyzer.py](https://gitlab.ujaen.es/jcollado/textflow/blob/master/textflow/EmojiAnalyzer.py):** This module provides a class with methods for calculating different emoji metrics on a sequence. This class inherits from Analyzer.py
+
 **Note:** All of the analyzers implemented by default are applied to plain text.

 ### ./examples
@@ -59,12 +67,22 @@ In this section, we introduce the different metrics offered in this Python libra

 - **Stylometry:** Number of different words, different lexical index (TTR,RTT, Herdan, Mass, Somers, Dugast, Honore), frequency of stopwords, frequency of punctuation marks, frequency of words.

- **Polarity:** Polarity score of a text
+- **Polarity:** Polarity score of a text.
+
+- **Emotions:** Emotions score of a text.

- **Emotions:** Emotions score of a text
+- **Emojis:** Number of emojis in the text, their frequency, and the text with each emoji replaced by the words that describe it.
+
+- **NER:** The frequency of the different entity categories, the entities grouped by category, and the text with the entity categories in place of the original words.
+
+- **N-Grams:** The different n-grams of the text and their frequency.
+
+- **Irony:** Irony score of a text.

 # Dependencies

- **ComplexityAnalyzer.py, POSAnalyzer, LemmaAnalyzer:** In these classes, spacy is used to calculate the different metrics of the analyzers. If do you want to use other package, you should implements the methods nlp, sents, pos_, lemma_ and text.
+- **ComplexityAnalyzer.py, POSAnalyzer, LemmaAnalyzer and NERAnalyzer:** In these classes, spacy is used to calculate the different metrics of the analyzers. If you want to use another package, you should implement the methods nlp, sents, pos_, lemma_ and text.
+
+- **IronityAnalyzer.py, EmotionAnalyzer.py and PolarityAnalyzer.py:** These classes use models and pipelines from transformers; you can use different models to infer the irony, the emotion or the polarity of a text.

- **EmotionAnalyzer.py and PolarityAnalyzer.py:** These classes use models and pipelines of transformers, you can use different models to inference the emotion or the polarity of a text.
+- **EmojiAnalyzer.py:** This class uses the emoji library.
--- a/README.rst
+++ b/README.rst
--- a/textflow/EmojiAnalyzer.py
+++ b/textflow/EmojiAnalyzer.py
+from typing import Optional
+import emoji
+from textflow.Analyzer import Analyzer
+import re
+
+class EmojiAnalyzer(Analyzer):
+    """
+    A class that provides methods to analyze the different emojis of the text of a sequence.
+
+    Attributes:
+        language: the languague of text to analyze the emojis.
+        textEmojis: the text with the words of the emojis instead of emojis.
+        freqEmoji: a dictionary with the different emojis and their frequence.
+        nEmoji: the number of emojis that appear in the text to analyze.
+    """
+
+    def __init__(self, language='es'):
+        """
+        Create an emoji analyzer.
+
+        Args:
+            language: the languague of text to analyze the emojis.
+        """
+        self.language=language
+        
+    
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): 
+        """
+        Analyze a sequence with a emoji function.
+
+        Args:
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+        """
+        super().analyze(self.emojiResult,sequence, tag, levelOfAnalyzer, levelOfResult, True)
+
+    def emojiResult(self, arrayText):
+        """
+        Function that analyzes the emojis of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze
+        Returns:
+            A list with the dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
+        resultsList = []
+        for t in arrayText:
+            t.lower()
+            self.countEmoji(t)
+            result={
+                "TextWithoutEmoji": self.textEmojis,
+                "FreqEmoji": self.freqEmoji,
+                "NumEmojis": self.nEmoji
+            }
+            resultsList.append(result)
+        return resultsList
+
+    def countEmoji(self, text):
+        """
+        Function that counts the number of emojis that appear in the text, their frequency and
+        changes the corresponding emoji for its meaning in words
+
+        Args:
+            text: the text that we want to analyze
+        """
+        self.freqEmoji={}
+        textNoEmoji = emoji.demojize(text, language=self.language)
+        emojis = re.findall(r':[\w][_\w.]*:',textNoEmoji)
+        self.nEmoji = len(emojis)
+        for emo in emojis:
+            if emo in self.freqEmoji:
+                self.freqEmoji[emo] += 1
+            else:
+                self.freqEmoji[emo] = 1
+        self.textEmojis=re.sub(r':[\w][_\w.]*:', self.repl_func, textNoEmoji)
+
+    def repl_func(self, match):
+        """
+        Function that replace the match of a string.
+
+        Args:
+            match: the match object with the pattern
+
+        Returns:
+            The match with ' ' instead of ':' and with ' ' instead of '_'
+        """
+        return match[0].replace(":"," ").replace("_"," ")
+        
+
+
--- a/textflow/EmotionAnalyzer.py
+++ b/textflow/EmotionAnalyzer.py
@@ -4,7 +4,6 @@ import spacy.cli
 from typing import Optional
 from textflow.Analyzer import Analyzer
 from transformers import pipeline
-import torch

 class EmotionAnalyzer(Analyzer):
    """

--- a/textflow/IronityAnalyzer.py
+++ b/textflow/IronityAnalyzer.py
+from typing import Optional
+from textflow.Analyzer import Analyzer
+from transformers import pipeline
+from transformers import AutoModelForSequenceClassification,AutoTokenizer
+
+
+class IronityAnalyzer(Analyzer):
+    """
+    A class that provides methods to analyze the ironity of the text of a sequence.
+
+    Attributes:
+       ironityClassifier: a pipeline that uses a model for inference the ironity of the text of a sequence. 
+                          By default, the label 'NI' is non-ironic and 'I' ironic.
+        maxEmbedding: The number of max_position_embedings in the config.json of the model selected.
+    """
+
+    def __init__(self, task = "text-classification",modelIronity = 'dtomas/roberta-base-bne-irony', allScores = True, maxEmbedding = 512):
+        """
+        Create an ironic analyzer.
+
+        Args:
+            task: the task defining which pipeline will be returned.
+            model: the model that will be used by the pipeline to make predictions.
+            allScores: True, if we want that the classifier returns all scores. False, in other case.
+            maxEmbedding: The number of max_position_embedings in the config.json of the model selected.
+        """
+        if modelIronity == 'dtomas/roberta-base-bne-irony':
+            model = AutoModelForSequenceClassification.from_pretrained(modelIronity)
+            model.config.id2label = {0: 'NI', 1: 'I'}
+            model.config.label2id = {'NI': 0, 'I': 1}
+            tokenizer = AutoTokenizer.from_pretrained(modelIronity, model_max_length=512)
+            self.ironityClassifier = pipeline(task,model= model, tokenizer=tokenizer,return_all_scores=allScores)
+        else:
+            self.ironityClassifier = pipeline(task,model= modelIronity, return_all_scores=allScores)
+        self.maxEmbeding = maxEmbedding
+        
+
+    
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): 
+        """
+        Analyze a sequence with a ironic function.
+
+        Args:
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+        """
+        super().analyze(self.polarity,sequence, tag, levelOfAnalyzer, levelOfResult, True)
+
+    def polarity(self, arrayText):
+        """
+        Function that analyzes the ironity of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze
+        Returns:
+            A list with the dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
+        arrayResults =[]
+        for text in arrayText:
+            prediction = self.ironityClassifier(text[:self.maxEmbeding])
+            arrayResults.append(prediction)
+        return arrayResults
+
--- a/textflow/NERAnalyzer.py
+++ b/textflow/NERAnalyzer.py
+
+import spacy
+import spacy.cli
+from typing import Optional
+from textflow.Analyzer import Analyzer
+
+spacy.cli.download("es_core_news_sm")
+
+class NERAnalyzer(Analyzer):
+    """
+    A class that provides methods to analyze the NER of the text of a sequence.
+
+    Attributes:
+        nlp: a model of language.
+        textNER:the text with the entities instead of the words.
+        dicEntidades:a dictionary with the entities.
+        dicEntidadesFrecuencia: a dictionary with the frequence of the different entities.
+    """
+
+    def __init__(self, nlp = spacy.load("es_core_news_sm")):
+        """
+        Create a NER analyzer from an input object.
+
+        Args:
+            nlp: a model of language.
+        """
+        self.nlp = nlp
+
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): 
+        """
+        Analyze a sequence with a NER function.
+
+        Args:
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result. 
+        """
+        super().analyze(self.nerAnalyzer,sequence, tag, levelOfAnalyzer, levelOfResult, True)
+
+
+    def nerAnalyzer(self, arrayText):
+        '''
+        Function that get the NER of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze
+        
+        Returns:
+            A list with the dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        '''
+        arrayResult = []
+        for text in arrayText:
+            self.freqNer(text)
+            ner={
+                "srcNER" : self.textNER,
+                "entidades" : self.dicEntidades,
+                "freqEntidades" : self.dicEntidadesFrecuencia
+            }
+            arrayResult.append(ner)
+        return arrayResult
+
+    def freqNer(self,text):
+        """
+        Function that counts the number of the different categories of NER that appear in the text, their frequency and
+        changes the corresponding word by its NER category
+
+        Args:
+            text: the text that we want to analyze
+        """
+        self.dicEntidades= {}
+        self.dicEntidadesFrecuencia = {}
+        doc = self.nlp(text)
+        textner=[]
+        for i in range(len(doc)):
+            if doc[i].ent_type_ != '':
+                textner.append(doc[i].ent_type_)
+            else:
+                textner.append(doc[i].text)
+        print(textner)
+        self.textNER = " ".join(textner) 
+        for ent in doc.ents:
+            #Guardamos el diccionario obtenido para la categoria de la palabra (si este existe)
+            dicPalabras = self.dicEntidades.get(ent.label_)
+            
+            #Si hay un diccionario, es decir es una categoría que ha aparecido previamente
+            if dicPalabras != None:
+                #Aumentamos la frecuencia de aparición en esta categoría
+                self.dicEntidadesFrecuencia[ent.label_] += 1
+                #introducimos en el diccionario la palabra
+                if ent.text.lower() in dicPalabras:
+                    dicPalabras[ent.text.lower()] += 1
+                else:
+                    dicPalabras[ent.text.lower()] = 1
+
+            #Si es igual de none, no tenemos esa categoria
+            else:
+                #Creamos el diccionario para esta categoria
+                palabrasFrecuencia ={}
+                #Insertamos la palabra actual en el diccionario
+                palabrasFrecuencia[ent.text.lower()] = 1
+                #Insertamos el diccionario dentro del diccionario de categorias para la categoria asociada
+                self.dicEntidades[ent.label_] = palabrasFrecuencia
+                #Ponemos a uno la frecuencia de aparición para esa categoría de entidad
+                self.dicEntidadesFrecuencia[ent.label_] = 1
+
+
+
+            
+                                
+                                
+
+
+
--- a/textflow/NGramsAnalyzer.py
+++ b/textflow/NGramsAnalyzer.py
+from typing import Optional
+from textflow.Analyzer import Analyzer
+from nltk.tokenize import WhitespaceTokenizer
+from nltk.corpus import stopwords
+import sklearn.feature_extraction.text
+
+class NGramsAnalyzer(Analyzer):
+    """
+    A class that provides methods to analyze the n-grams of the text of a sequence.
+
+    Attributes:
+        stopwords: a list with stopwords.
+        tokenizer: a function to tokenize the text.
+        ngramsSize: a number with the size of the n-grams.
+        listOfNGrams: a list witn the n-grams of the text to analyze.
+        freqNGrams: a dictionary with the different n-grams and their frequence in the text to analyze.
+    """
+
+    def __init__(self, stopwords=stopwords.words('spanish'), tokenizer = WhitespaceTokenizer(), ngramsSize = 2):
+        """
+        Create a n-grams analyzer.
+
+        Args:
+            stopwords: a list with stopwords.
+            tokenizer: a function to tokenize the text.
+            ngramsSize: a number with the size of the n-grams.
+        """
+        self.stopwords = stopwords
+        self.tokenizer = tokenizer
+        self.ngramsSize = ngramsSize
+        
+
+    
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): 
+        """
+        Analyze a sequence with a n-grams function.
+
+        Args:
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+        """
+        super().analyze(self.ngrams,sequence, tag, levelOfAnalyzer, levelOfResult, True)
+
+    def ngrams(self, arrayText):
+        """
+        Function that analyzes the n-grams of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze
+        Returns:
+            A list with the dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
+        arrayResults =[]
+        for text in arrayText:
+            self.countFreqNGrams(text)
+            prediction = {
+                'n-grams': self.listOfNGrams,
+                'freqN-Grams': self.freqNGrams
+            }
+            arrayResults.append(prediction)
+        return arrayResults
+
+    def countFreqNGrams(self,text):
+        """
+        Function that divide the text in n-grams, and count the frequence of them.
+    
+        Args:
+            text: a string/text to analyze
+        """
+        vect = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(self.ngramsSize,self.ngramsSize),tokenizer=self.tokenizer.tokenize,stop_words= self.stopwords)
+        text=[text]
+        vect.fit(text)
+        self.listOfNGrams = vect.get_feature_names_out().tolist()
+        dicfreq={}
+        for i in self.listOfNGrams:
+            if i in dicfreq:
+                dicfreq[i] += 1
+            else:
+                dicfreq[i] = 1
+        self.freqNGrams = dicfreq
+
--- a/textflow/PolarityAnalyzer.py
+++ b/textflow/PolarityAnalyzer.py
 from typing import Optional
 from textflow.Analyzer import Analyzer
 from transformers import pipeline
-import torch

 class PolarityAnalyzer(Analyzer):
    """

--- a/textflow/SequenceToken.py
+++ b/textflow/SequenceToken.py
@@ -36,7 +36,7 @@ class SequenceToken (Sequence):
        Args:
            format: a string with the origin format of the sequence.
        '''
-        super().inicializeSequence(format)
+        super().initializeSequence(format)

    def __str__(self):
        '''

--- a/textflow/StylometryAnalyzer.py
+++ b/textflow/StylometryAnalyzer.py
 import string
 from typing import Optional

-#import spacy
-#import spacy.cli
 from nltk.text import Text
 from nltk.tokenize import WhitespaceTokenizer
 import math