Implementing some analyzers

parent f3e06440
...@@ -5,14 +5,16 @@ description = "A text analysis library for Python" ...@@ -5,14 +5,16 @@ description = "A text analysis library for Python"
authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@red.ujaen.es>"] authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@red.ujaen.es>"]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.8" python = "3.8"
nltk = "^3.7" nltk = "^3.7"
spacy = "^3.3.0" spacy = "^3.3.0"
transformers = "^4.18.0" transformers = "^4.19.0"
torch = {version = "^1.11.0", python = "^3.7", platform = "linux"}
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
pytest = "^5.2" pytest = "^5.2"
[build-system] [build-system]
requires = ["poetry-core>=1.0.0"] requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
...@@ -7,11 +7,13 @@ import re ...@@ -7,11 +7,13 @@ import re
import numpy as np import numpy as np
import math import math
from functools import reduce from functools import reduce
from textflow.Analyzer import Analyzer
creaPath = os.path.join(os.path.dirname(__file__), 'Crea-5000.txt') creaPath = os.path.join(os.path.dirname(__file__), 'Crea-5000.txt')
class ComplexityAnalyzer: class ComplexityAnalyzer(Analyzer):
def __init__(self, lang = "es"): def __init__(self, rutaArchivoCrea = creaPath,lang = "es"):
"""Creates an analyzer from an input object. """Creates an analyzer from an input object.
Args: Args:
...@@ -22,11 +24,7 @@ class ComplexityAnalyzer: ...@@ -22,11 +24,7 @@ class ComplexityAnalyzer:
spacy.cli.download("es_core_news_sm") spacy.cli.download("es_core_news_sm")
self.nlp = spacy.load("es_core_news_sm") self.nlp = spacy.load("es_core_news_sm")
#Vamos a cargar CREA: #Vamos a cargar CREA:
self.dicFreqWords=self.read(creaPath) self.dicFreqWords=self.read(rutaArchivoCrea)
self.function = self.complexity
'''elif lang == "en":
spacy.cli.download("en_core_web_sm")
self.nlp = spacy.load("en_core_web_sm")'''
#Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata #Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
...@@ -42,7 +40,8 @@ class ComplexityAnalyzer: ...@@ -42,7 +40,8 @@ class ComplexityAnalyzer:
Raises: Raises:
ValueError if the levelOfResult is incorrect ValueError if the levelOfResult is incorrect
""" """
if levelOfResult == "": super().analyze(self.complexity,sequence, tag, levelOfAnalyzer, levelOfResult, True)
'''if levelOfResult == "":
analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO
resultOfAnalisys= [] resultOfAnalisys= []
for i in analyzeResult: for i in analyzeResult:
...@@ -64,7 +63,7 @@ class ComplexityAnalyzer: ...@@ -64,7 +63,7 @@ class ComplexityAnalyzer:
else: else:
children = [c.children for c in child[r]] children = [c.children for c in child[r]]
else: else:
raise ValueError(f"Sequence level '{r}' not found in {child}") raise ValueError(f"Sequence level '{r}' not found in {child}") '''
def read(self,fichero): def read(self,fichero):
......
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch
class EmotionAnalyzer(Analyzer):
    """Analyzer that labels plain texts with emotions using a transformers pipeline."""

    def __init__(self, task = "text-classification",modelEmotions = 'pysentimiento/robertuito-emotion-analysis', allScores = True):
        """Create an emotion analyzer backed by a Hugging Face pipeline.

        Args:
            task: the transformers pipeline task name.
            modelEmotions: the model checkpoint used to predict emotions.
            allScores: if True the classifier returns the score of every label,
                not only the top one.
        """
        self.emotionsClassifier = pipeline(task, model=modelEmotions, return_all_scores=allScores)

    # This analyzer only handles plain text, so the result is always stored in metadata.
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
        """Analyze a sequence and store the emotion predictions under `tag`.

        Args:
            sequence: the Sequence to analyze.
            tag: the label under which the analysis result is stored.
            levelOfAnalyzer: path of the sequence level to analyze inside the result.
            levelOfResult: path of the sequence level where the result is stored.

        Raises:
            ValueError: if levelOfResult is incorrect.
        """
        super().analyze(self.emotions, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def emotions(self, arrayText):
        """Run the emotion classifier over each text and collect the raw predictions.

        Args:
            arrayText: iterable of strings to classify.

        Returns:
            A list with the classifier output for each input text.
        """
        return [self.emotionsClassifier(text) for text in arrayText]
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
spacy.cli.download("es_core_news_sm")
class LemmaAnalyzer(Analyzer):
    """Analyzer that extracts lemmas and simple lemma statistics from texts."""

    def __init__(self, nlp = None, posNoContent = ("PUNCT", "SPACE", "SYM")):
        """Create a lemma analyzer.

        Args:
            nlp: a spaCy language pipeline. When None, "es_core_news_sm" is
                loaded lazily here instead of in the parameter default — the
                original default loaded the model at class-definition time,
                even when every caller supplied its own pipeline.
            posNoContent: POS tags whose tokens carry no content and are
                skipped when collecting lemmas.
        """
        if nlp is None:
            nlp = spacy.load("es_core_news_sm")
        self.nlp = nlp
        # Copy to a list so a caller-supplied tuple behaves like the original default.
        self.posNoContent = list(posNoContent)

    # This analyzer only handles plain text, so the result is always stored in metadata.
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
        """Analyze a sequence and store the lemma statistics under `tag`.

        Args:
            sequence: the Sequence to analyze.
            tag: the label under which the analysis result is stored.
            levelOfAnalyzer: path of the sequence level to analyze inside the result.
            levelOfResult: path of the sequence level where the result is stored.

        Raises:
            ValueError: if levelOfResult is incorrect.
        """
        super().analyze(self.lemmas, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def lemmas(self, arrayText):
        """Lemmatize each text, skipping non-content tokens.

        Args:
            arrayText: iterable of strings to lemmatize.

        Returns:
            A list with one dict per text: "srclemmas" (lemmas in order),
            "uniqueLemmas" (number of distinct lemmas) and "avgLemmas"
            (average lemma length, rounded; 0 when the text has no content tokens).
        """
        arrayResult = []
        for text in arrayText:
            sequenceLemmas = []
            setLemmas = set()
            sumaLenLemmas = 0
            for token in self.nlp(text):
                if token.pos_ not in self.posNoContent:
                    sumaLenLemmas += len(token.lemma_)
                    setLemmas.add(token.lemma_)
                    sequenceLemmas.append(token.lemma_)
            lemma = {
                "srclemmas": sequenceLemmas,
                "uniqueLemmas": len(setLemmas),
                # Guard: a text made only of punctuation/whitespace would
                # otherwise raise ZeroDivisionError.
                "avgLemmas": round(sumaLenLemmas / len(sequenceLemmas)) if sequenceLemmas else 0,
            }
            arrayResult.append(lemma)
        return arrayResult
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
spacy.cli.download("es_core_news_sm")
class POSAnalyzer(Analyzer):
    """Analyzer that tags each token of a text with its part of speech (POS)."""

    def __init__(self, nlp = None):
        """Create a POS analyzer.

        Args:
            nlp: a spaCy language pipeline. When None, "es_core_news_sm" is
                loaded lazily here instead of in the parameter default — the
                original default loaded the model at class-definition time,
                even when every caller supplied its own pipeline.
        """
        if nlp is None:
            nlp = spacy.load("es_core_news_sm")
        self.nlp = nlp

    # This analyzer only handles plain text, so the result is always stored in metadata.
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
        """Analyze a sequence and store the POS tags under `tag`.

        Args:
            sequence: the Sequence to analyze.
            tag: the label under which the analysis result is stored.
            levelOfAnalyzer: path of the sequence level to analyze inside the result.
            levelOfResult: path of the sequence level where the result is stored.

        Raises:
            ValueError: if levelOfResult is incorrect.
        """
        super().analyze(self.pos, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def pos(self, arrayText):
        """Tag every text and count the frequency of each POS label.

        Args:
            arrayText: iterable of strings to tag.

        Returns:
            A list with one dict per text: "srcPOS" (one tag per token, in
            order) and "FreqPOS" (tag -> number of occurrences).
        """
        arrayResults = []
        for text in arrayText:
            srcPOS = []
            dicFreqPOS = {}
            for token in self.nlp(text):
                srcPOS.append(token.pos_)
                # dict.get replaces the original if/else counting branch.
                dicFreqPOS[token.pos_] = dicFreqPOS.get(token.pos_, 0) + 1
            arrayResults.append({
                "srcPOS": srcPOS,
                "FreqPOS": dicFreqPOS,
            })
        return arrayResults
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch
class PolarityAnalyzer(Analyzer):
    """Analyzer that labels plain texts with sentiment polarity using a transformers pipeline."""

    def __init__(self, task = "text-classification",modelPolarity = 'finiteautomata/beto-sentiment-analysis', allScores = True):
        """Create a polarity analyzer backed by a Hugging Face pipeline.

        Args:
            task: the transformers pipeline task name.
            modelPolarity: the model checkpoint used to predict polarity.
            allScores: if True the classifier returns the score of every label,
                not only the top one.
        """
        self.polarityClassifier = pipeline(task, model=modelPolarity, return_all_scores=allScores)

    # This analyzer only handles plain text, so the result is always stored in metadata.
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
        """Analyze a sequence and store the polarity predictions under `tag`.

        Args:
            sequence: the Sequence to analyze.
            tag: the label under which the analysis result is stored.
            levelOfAnalyzer: path of the sequence level to analyze inside the result.
            levelOfResult: path of the sequence level where the result is stored.

        Raises:
            ValueError: if levelOfResult is incorrect.
        """
        super().analyze(self.polarity, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def polarity(self, arrayText):
        """Run the polarity classifier over each text and collect the raw predictions.

        Args:
            arrayText: iterable of strings to classify.

        Returns:
            A list with the classifier output for each input text.
        """
        return [self.polarityClassifier(text) for text in arrayText]
...@@ -4,6 +4,7 @@ from nltk.tokenize import TreebankWordTokenizer ...@@ -4,6 +4,7 @@ from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import WhitespaceTokenizer from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import SpaceTokenizer from nltk.tokenize import SpaceTokenizer
from nltk.tokenize import WordPunctTokenizer from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import RegexpTokenizer
class SequenceIterator: class SequenceIterator:
......
import string
from typing import Optional from typing import Optional
import spacy #import spacy
import spacy.cli #import spacy.cli
from nltk.text import Text
from nltk.tokenize import WhitespaceTokenizer
import math
from textflow.Analyzer import Analyzer
class StylometryyAnalyzer: #TODO class StylometryAnalyzer(Analyzer): #TODO
def __init__(self, lang = "es"):
if lang == "es": def __init__(self,stopwords, puntuation = string.punctuation,tokenizer = WhitespaceTokenizer()):
spacy.cli.download("es_core_news_sm") self.stopwords = stopwords
self.nlp = spacy.load("es_core_news_sm") self.puntuation = puntuation
self.function = self.stylometry self.tokenizer = tokenizer
pass
#Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata #Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str]= ""):
"""Analyze a sequence super().analyze(self.stylometry,sequence, tag, levelOfAnalyzer, levelOfResult, True)
Args: def stylometry(self, arrayText):
sequence: the Sequence we want to analyze resultsList = []
tag: the label to store the analysis resut for t in arrayText:
levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado) #doc = self.nlp(text)
levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion) t.lower()
analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False) tokens = self.tokenizer.tokenize (t)
text= [token.lower() for token in tokens]
Raises: self.freqWords(text,self.stopwords,self.puntuation)
ValueError if the levelOfResult is incorrect self.funcionesTTR(text)
""" result={
if levelOfResult == "": "uniqueWords": len(self.uniqueWords),
analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO "TTR": self.TTR,
resultOfAnalisys= [] "RTTR": self.RTTR,
for i in analyzeResult: "Herdan": self.herdan,
resultOfAnalisys.append(i) "Mass": self.mass,
sequence.metadata[tag] = resultOfAnalisys "Somers": self.somers,
"Dugast": self.dugast,
"Honore": self.honore,
"FreqStopWords": self.freqStopWords,
"FreqPuntuationMarks": self.freqPuntuationMarks,
"FreqWords": self.freqWord
}
resultsList.append(result)
return resultsList
def funcionesTTR(self, text):
self.uniqueWords = [token[0] for token in self.freqWord]
self.numWordFreqOne = len( [token[0] for token in self.freqWord if token[1] == 1 ])
self.TTR = len(self.uniqueWords) / len(text)
self.RTTR = len(self.uniqueWords) / math.sqrt(len(text))
self.herdan = math.log(len(self.uniqueWords),10) / math.log(len(text),10)
self.mass = (math.log(len(text),10)- math.log(len(self.uniqueWords),10)) / pow(math.log(len(self.uniqueWords),10),2)
self.somers = math.log(math.log(len(self.uniqueWords),10),10) / math.log(math.log(len(text),10),10)
if math.log(len(text),10)- math.log(len(self.uniqueWords),10) == 0:
self.dugast = pow(math.log(len(text),10),2)
else: else:
children = [sequence.children] self.dugast = pow(math.log(len(text),10),2) / (math.log(len(text),10)- math.log(len(self.uniqueWords),10))
ruta = levelOfResult.split("/") if 1-(self.numWordFreqOne/len(self.uniqueWords)) == 0:
for r in ruta: #Para cada nivel de la ruta self.honore = 100*(math.log(len(text),10))
for child in children: #Miramos en todas las secuencias disponibles else:
if r in child: #Si dentro de la secuencia actual está r self.honore = 100*(math.log(len(text),10)/(1-(self.numWordFreqOne/len(self.uniqueWords))))
if r == ruta[-1]:
for seq in child[r]:
analyzeResult = seq.filterMetadata(levelOfAnalyzer,self.function) def freqWords(self,tokens, stopWords, puntuationMarks):
resultOfAnalisys= [] freqStopWords = {}
for i in analyzeResult: freqPuntuationMarks = {}
resultOfAnalisys.append(i) freqWord ={}
seq.metadata[tag] = resultOfAnalisys for token in tokens:
else: if token in stopWords:
children = [c.children for c in child[r]] if token in freqStopWords:
else: freqStopWords[token] += 1
raise ValueError(f"Sequence level '{r}' not found in {child}") else:
freqStopWords[token] = 1
def stylometry(self): elif token in puntuationMarks:
pass if token in freqPuntuationMarks:
freqPuntuationMarks[token] += 1
else:
freqPuntuationMarks[token] = 1
else:
if token in freqWord:
freqWord[token] += 1
else:
freqWord[token] = 1
self.freqWord = sorted(freqWord.items(), reverse = True)
self.freqPuntuationMarks = sorted(freqPuntuationMarks.items(), reverse = True)
self.freqStopWords = sorted(freqStopWords.items(), reverse = True)
\ No newline at end of file
from typing import Optional
from textflow.Sequence import Sequence
from nltk.tokenize import WhitespaceTokenizer
from textflow.Analyzer import Analyzer
class VolumetryAnalyzer(Analyzer):
    """Analyzer that computes simple volumetric statistics (word/char counts) of a text."""

    def __init__(self, tokenizer = None):
        """Create a volumetry analyzer.

        Args:
            tokenizer: tokenizer used to split texts into words; defaults to a
                fresh nltk WhitespaceTokenizer. The lazy default avoids the
                single shared instance the original created at
                class-definition time.
        """
        self.tokenizer = tokenizer if tokenizer is not None else WhitespaceTokenizer()

    def volumetry(self, arrayText):
        """Compute the volumetry of each text.

        Args:
            arrayText: iterable of strings to measure.

        Returns:
            A list with one dict per text: "words" (token count),
            "uniqueWords" (distinct token count), "chars" (character count)
            and "avgWordsLen" (rounded average word length; 0 for texts with
            no tokens).
        """
        arrayResults = []
        for texts in arrayText:
            text = self.tokenizer.tokenize(texts)
            dicResults = {
                "words": len(text),
                "uniqueWords": len(set(text)),
                "chars": len(texts),
                # Guard: empty or whitespace-only input yields no tokens and
                # would otherwise raise ZeroDivisionError.
                "avgWordsLen": round(len(texts) / len(text)) if text else 0,
            }
            arrayResults.append(dicResults)
        return arrayResults

    # The sequence must expose a "text" metadata attribute for this analyzer to work;
    # the result is always stored in metadata.
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
        """Analyze a sequence and store the volumetry result under `tag`.

        Args:
            sequence: the Sequence to analyze.
            tag: the label under which the analysis result is stored.
            levelOfAnalyzer: path of the sequence level to analyze inside the result.
            levelOfResult: path of the sequence level where the result is stored.

        Raises:
            ValueError: if levelOfResult is incorrect.
        """
        super().analyze(self.volumetry, sequence, tag, levelOfAnalyzer, levelOfResult, True)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment