new analyzers added

parent fd498956
......@@ -36,6 +36,14 @@ This class provides methods to create a sequence from directories, documents, st
- **[VolumetryAnalyzer.py](https://gitlab.ujaen.es/jcollado/textflow/blob/master/textflow/VolumetryAnalyzer.py):** This module provides a class methods for the calculation of different volumetry metrics on a sequence. This class inherits from Analyzer.py
- **[IronityAnalyzer.py](https://gitlab.ujaen.es/jcollado/textflow/blob/master/textflow/IronityAnalyzer.py):** This module provides a class methods for the calculation of the ironity on a sequence. This class inherits from Analyzer.py
- **[NERAnalyzer.py](https://gitlab.ujaen.es/jcollado/textflow/blob/master/textflow/NERAnalyzer.py):** This module provides a class methods for the search of different NER on a sequence. This class inherits from Analyzer.py
- **[NGramsAnalyzer.py](https://gitlab.ujaen.es/jcollado/textflow/blob/master/textflow/NGramsAnalyzer.py):** This module provides a class with methods for the calculation of n-grams and their frequency on a sequence. This class inherits from Analyzer.py
- **[EmojiAnalyzer.py](https://gitlab.ujaen.es/jcollado/textflow/blob/master/textflow/EmojiAnalyzer.py):** This module provides a class with methods for the calculation of different emoji metrics on a sequence. This class inherits from Analyzer.py
**Note:** All of the analyzers implemented by default are applied to plain text.
### ./examples
......@@ -59,12 +67,22 @@ In this section, we introduce the different metrics offered in this Python libra
- **Stylometry:** Number of different words, different lexical index (TTR,RTT, Herdan, Mass, Somers, Dugast, Honore), frequency of stopwords, frequency of punctuation marks, frequency of words.
- **Polarity:** Polarity score of a text
- **Polarity:** Polarity score of a text.
- **Emotions:** Emotions score of a text.
- **Emotions:** Emotions score of a text
- **Emojis:** Number of emojis in the text, their frequency, and the text with each emoji replaced by the words that describe it.
- **NER:** The frequency of the different entity categories, the entities grouped by category, and the text with each entity replaced by its category.
- **N-Grams:** The different n-grams of the text and their frequency.
- **Ironity:** Ironity score of a text.
# Dependencies
- **ComplexityAnalyzer.py, POSAnalyzer, LemmaAnalyzer:** In these classes, spacy is used to calculate the different metrics of the analyzers. If do you want to use other package, you should implements the methods nlp, sents, pos_, lemma_ and text.
- **ComplexityAnalyzer.py, POSAnalyzer, LemmaAnalyzer and NERAnalyzer:** In these classes, spacy is used to calculate the different metrics of the analyzers. If you want to use another package, you should implement the methods nlp, sents, pos_, lemma_ and text.
- **IronityAnalyzer.py, EmotionAnalyzer.py and PolarityAnalyzer.py:** These classes use models and pipelines from transformers; you can use different models to infer the irony, the emotion or the polarity of a text.
- **EmotionAnalyzer.py and PolarityAnalyzer.py:** These classes use models and pipelines of transformers, you can use different models to inference the emotion or the polarity of a text.
- **EmojiAnalyzer.py:** This class uses the emoji library.
File mode changed
from typing import Optional
import emoji
from textflow.Analyzer import Analyzer
import re
class EmojiAnalyzer(Analyzer):
    """
    A class that provides methods to analyze the emojis in the text of a sequence.

    Attributes:
        language: the language used to spell out the emoji names.
        textEmojis: the last analyzed text with every emoji replaced by its name in words.
        freqEmoji: a dictionary that maps each emoji name (written as ':name:') to the
            number of times it appears in the last analyzed text.
        nEmoji: the number of emojis that appear in the last analyzed text.
    """

    # emoji.demojize() is asked to wrap emoji names in these control characters
    # instead of the default ':' so that colon-delimited text that was already in
    # the input (for example the ':30:' in '10:30:45') is never mistaken for an
    # emoji. Input that itself contains these two control characters is not
    # supported.
    _EMOJI_OPEN = "\x02"
    _EMOJI_CLOSE = "\x03"
    _EMOJI_PATTERN = re.compile("\x02([^\x02\x03]+)\x03")

    def __init__(self, language='es'):
        """
        Create an emoji analyzer.

        Args:
            language: the language used to spell out the emoji names.
        """
        self.language = language

    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
        """
        Analyze a sequence with the emoji function.

        Args:
            sequence: the Sequence we want to analyze.
            tag: the label to store the analysis result.
            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
            levelOfResult: the path of the sequence level to store the result.
        """
        super().analyze(self.emojiResult, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def emojiResult(self, arrayText):
        """
        Analyze the emojis of a list of texts.

        Args:
            arrayText: list that contains the texts that we want to analyze.

        Returns:
            A list of dictionaries, one per input text, with the keys
            "TextWithoutEmoji", "FreqEmoji" and "NumEmojis".
        """
        resultsList = []
        for text in arrayText:
            # countEmoji() stores its outcome on the instance; a fresh frequency
            # dictionary is created for every text, so results do not alias.
            self.countEmoji(text)
            resultsList.append({
                "TextWithoutEmoji": self.textEmojis,
                "FreqEmoji": self.freqEmoji,
                "NumEmojis": self.nEmoji,
            })
        return resultsList

    def countEmoji(self, text):
        """
        Count the emojis of a text and replace each emoji by its name in words.

        The outcome is stored in the attributes nEmoji, freqEmoji and textEmojis.

        Args:
            text: the text that we want to analyze.
        """
        marked = emoji.demojize(
            text,
            delimiters=(self._EMOJI_OPEN, self._EMOJI_CLOSE),
            language=self.language,
        )
        frequencies = {}
        for name in self._EMOJI_PATTERN.findall(marked):
            # Keys keep the conventional ':name:' spelling.
            key = ":" + name + ":"
            frequencies[key] = frequencies.get(key, 0) + 1
        self.freqEmoji = frequencies
        self.nEmoji = sum(frequencies.values())
        self.textEmojis = self._EMOJI_PATTERN.sub(self._emojiToWords, marked)

    @staticmethod
    def _emojiToWords(match):
        """Return the emoji name of a sentinel-delimited match as space-padded words."""
        return " " + match.group(1).replace(":", " ").replace("_", " ") + " "

    def repl_func(self, match):
        """
        Spell out a ':emoji_name:' match as plain words.

        Args:
            match: the match object with the pattern.

        Returns:
            The matched text with ' ' instead of ':' and ' ' instead of '_'.
        """
        return match[0].replace(":", " ").replace("_", " ")
......@@ -4,7 +4,6 @@ import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch
class EmotionAnalyzer(Analyzer):
"""
......
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
from transformers import AutoModelForSequenceClassification,AutoTokenizer
class IronityAnalyzer(Analyzer):
    """
    A class that provides methods to analyze the ironity of the text of a sequence.

    Attributes:
        ironityClassifier: a pipeline that uses a model to infer the ironity of the
            text of a sequence. With the default model, the label 'NI' is
            non-ironic and 'I' is ironic.
        maxEmbedding: the value of max_position_embeddings in the config.json of the
            selected model. It is used to cut every text before classification.
        maxEmbeding: deprecated misspelled alias of maxEmbedding, kept so that
            existing code that reads or writes it keeps working.
    """

    def __init__(self, task="text-classification", modelIronity='dtomas/roberta-base-bne-irony', allScores=True, maxEmbedding=512):
        """
        Create an ironity analyzer.

        Args:
            task: the task defining which pipeline will be returned.
            modelIronity: the model that will be used by the pipeline to make predictions.
            allScores: True if the classifier should return all scores, False otherwise.
            maxEmbedding: the value of max_position_embeddings in the config.json of
                the selected model.
        """
        if modelIronity == 'dtomas/roberta-base-bne-irony':
            # The default model gets readable label names and an explicit
            # tokenizer length limit.
            model = AutoModelForSequenceClassification.from_pretrained(modelIronity)
            model.config.id2label = {0: 'NI', 1: 'I'}
            model.config.label2id = {'NI': 0, 'I': 1}
            tokenizer = AutoTokenizer.from_pretrained(modelIronity, model_max_length=512)
            self.ironityClassifier = pipeline(task, model=model, tokenizer=tokenizer, return_all_scores=allScores)
        else:
            self.ironityClassifier = pipeline(task, model=modelIronity, return_all_scores=allScores)
        self.maxEmbedding = maxEmbedding

    @property
    def maxEmbeding(self):
        """Backward-compatible alias of maxEmbedding (original misspelling)."""
        return self.maxEmbedding

    @maxEmbeding.setter
    def maxEmbeding(self, value):
        self.maxEmbedding = value

    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
        """
        Analyze a sequence with the ironity function.

        Args:
            sequence: the Sequence we want to analyze.
            tag: the label to store the analysis result.
            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
            levelOfResult: the path of the sequence level to store the result.
        """
        super().analyze(self.polarity, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def polarity(self, arrayText):
        """
        Analyze the ironity of a list of texts.

        The method keeps the name 'polarity' so existing callers and subclasses
        keep working.

        Args:
            arrayText: list that contains the texts that we want to analyze.

        Returns:
            A list with one classifier prediction per input text.
        """
        # The slice cuts each text to maxEmbedding items (characters, assuming the
        # texts are str) before it reaches the classifier.
        return [self.ironityClassifier(text[:self.maxEmbedding]) for text in arrayText]
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
# Download the Spanish spaCy model that NERAnalyzer loads as its default `nlp`.
# NOTE: this is a module-level statement, so it runs when the module is imported.
spacy.cli.download("es_core_news_sm")
class NERAnalyzer(Analyzer):
    """
    A class that provides methods to analyze the named entities (NER) of the text of a sequence.

    Attributes:
        nlp: a language model.
        textNER: the last analyzed text with every entity token replaced by its entity type.
        dicEntidades: a dictionary that maps each entity category to a dictionary
            of the (lower-cased) entities found in it and their frequency.
        dicEntidadesFrecuencia: a dictionary that maps each entity category to the
            number of entities found in it.
    """

    def __init__(self, nlp = spacy.load("es_core_news_sm")):
        """
        Create a NER analyzer from an input object.

        Args:
            nlp: a language model. The default model is loaded once, when the
                class is defined, and shared by every instance that uses it.
        """
        self.nlp = nlp

    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
        """
        Analyze a sequence with the NER function.

        Args:
            sequence: the Sequence we want to analyze.
            tag: the label to store the analysis result.
            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
            levelOfResult: the path of the sequence level to store the result.
        """
        super().analyze(self.nerAnalyzer, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def nerAnalyzer(self, arrayText):
        """
        Get the named entities of a list of texts.

        Args:
            arrayText: list that contains the texts that we want to analyze.

        Returns:
            A list of dictionaries, one per input text, with the keys
            "srcNER", "entidades" and "freqEntidades".
        """
        arrayResult = []
        for text in arrayText:
            # freqNer() stores its outcome on the instance and builds fresh
            # dictionaries for every text.
            self.freqNer(text)
            arrayResult.append({
                "srcNER": self.textNER,
                "entidades": self.dicEntidades,
                "freqEntidades": self.dicEntidadesFrecuencia,
            })
        return arrayResult

    def freqNer(self, text):
        """
        Count the entities of a text per category and replace entity tokens by their category.

        The outcome is stored in the attributes textNER, dicEntidades and
        dicEntidadesFrecuencia.

        Args:
            text: the text that we want to analyze.
        """
        self.dicEntidades = {}
        self.dicEntidadesFrecuencia = {}
        doc = self.nlp(text)

        # Tokens that belong to an entity are replaced by the entity type; the
        # remaining tokens keep their text.
        self.textNER = " ".join(token.ent_type_ or token.text for token in doc)

        for ent in doc.ents:
            category = ent.label_
            word = ent.text.lower()
            # One more entity seen for this category.
            self.dicEntidadesFrecuencia[category] = self.dicEntidadesFrecuencia.get(category, 0) + 1
            # One more occurrence of this entity inside its category; the
            # per-category dictionary is created the first time it is needed.
            wordCounts = self.dicEntidades.setdefault(category, {})
            wordCounts[word] = wordCounts.get(word, 0) + 1
from typing import Optional
from textflow.Analyzer import Analyzer
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
import sklearn.feature_extraction.text
class NGramsAnalyzer(Analyzer):
    """
    A class that provides methods to analyze the n-grams of the text of a sequence.

    Attributes:
        stopwords: a list with stopwords.
        tokenizer: an object whose tokenize method splits the text into tokens.
        ngramsSize: a number with the size of the n-grams.
        listOfNGrams: a list with the different n-grams of the last analyzed text.
        freqNGrams: a dictionary that maps each different n-gram to the number of
            times it appears in the last analyzed text.
    """

    def __init__(self, stopwords=stopwords.words('spanish'), tokenizer = WhitespaceTokenizer(), ngramsSize = 2):
        """
        Create a n-grams analyzer.

        Args:
            stopwords: a list with stopwords. The default list is built once,
                when the class is defined.
            tokenizer: an object whose tokenize method splits the text into tokens.
            ngramsSize: a number with the size of the n-grams.
        """
        self.stopwords = stopwords
        self.tokenizer = tokenizer
        self.ngramsSize = ngramsSize

    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
        """
        Analyze a sequence with the n-grams function.

        Args:
            sequence: the Sequence we want to analyze.
            tag: the label to store the analysis result.
            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
            levelOfResult: the path of the sequence level to store the result.
        """
        super().analyze(self.ngrams, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def ngrams(self, arrayText):
        """
        Analyze the n-grams of a list of texts.

        Args:
            arrayText: list that contains the texts that we want to analyze.

        Returns:
            A list of dictionaries, one per input text, with the keys
            'n-grams' and 'freqN-Grams'.
        """
        arrayResults = []
        for text in arrayText:
            # countFreqNGrams() stores its outcome on the instance and builds
            # fresh containers for every text.
            self.countFreqNGrams(text)
            arrayResults.append({
                'n-grams': self.listOfNGrams,
                'freqN-Grams': self.freqNGrams,
            })
        return arrayResults

    def countFreqNGrams(self, text):
        """
        Split the text into n-grams and count how often each one appears.

        The outcome is stored in the attributes listOfNGrams and freqNGrams. A
        text that produces no n-gram at all yields an empty list and an empty
        dictionary.

        Args:
            text: a string/text to analyze.
        """
        vect = sklearn.feature_extraction.text.CountVectorizer(
            ngram_range=(self.ngramsSize, self.ngramsSize),
            tokenizer=self.tokenizer.tokenize,
            stop_words=self.stopwords,
        )
        try:
            # fit_transform (unlike fit alone) also returns the occurrence
            # counts; without them every n-gram would be reported once.
            counts = vect.fit_transform([text])
        except ValueError as err:
            # Only the "no n-gram could be built" case is turned into an empty
            # result; any other ValueError is still raised.
            if "empty vocabulary" not in str(err):
                raise
            self.listOfNGrams = []
            self.freqNGrams = {}
            return
        self.listOfNGrams = vect.get_feature_names_out().tolist()
        # Row 0 is the single document; its columns follow the feature-name order.
        occurrences = counts.toarray()[0].tolist()
        self.freqNGrams = dict(zip(self.listOfNGrams, occurrences))
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch
class PolarityAnalyzer(Analyzer):
"""
......
......@@ -36,7 +36,7 @@ class SequenceToken (Sequence):
Args:
format: a string with the origin format of the sequence.
'''
super().inicializeSequence(format)
super().initializeSequence(format)
def __str__(self):
'''
......
import string
from typing import Optional
#import spacy
#import spacy.cli
from nltk.text import Text
from nltk.tokenize import WhitespaceTokenizer
import math
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment