Implementing some analyzers

parent f3e06440
@@ -5,14 +5,16 @@ description = "A text analysis library for Python"
authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@red.ujaen.es>"]
[tool.poetry.dependencies]
python = "^3.8"
python = "3.8"
nltk = "^3.7"
spacy = "^3.3.0"
transformers = "^4.18.0"
transformers = "^4.19.0"
torch = {version = "^1.11.0", python = "^3.7", platform = "linux"}
[tool.poetry.dev-dependencies]
pytest = "^5.2"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
@@ -7,11 +7,13 @@ import re
import numpy as np
import math
from functools import reduce
from textflow.Analyzer import Analyzer
creaPath = os.path.join(os.path.dirname(__file__), 'Crea-5000.txt')
class ComplexityAnalyzer(Analyzer):
    def __init__(self, rutaArchivoCrea=creaPath, lang="es"):
        """Creates a complexity analyzer.

        Args:
            rutaArchivoCrea: path to the CREA word-frequency file (defaults to the bundled Crea-5000.txt).
            lang: language of the texts to analyze (currently only "es" is supported).
@@ -22,11 +24,7 @@ class ComplexityAnalyzer:
spacy.cli.download("es_core_news_sm")
self.nlp = spacy.load("es_core_news_sm")
        # Load the CREA frequency list:
self.dicFreqWords=self.read(rutaArchivoCrea)
    # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
@@ -42,7 +40,8 @@ class ComplexityAnalyzer:
Raises:
ValueError if the levelOfResult is incorrect
"""
if levelOfResult == "":
super().analyze(self.complexity,sequence, tag, levelOfAnalyzer, levelOfResult, True)
'''if levelOfResult == "":
analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO
resultOfAnalisys= []
for i in analyzeResult:
@@ -64,7 +63,7 @@
else:
children = [c.children for c in child[r]]
else:
raise ValueError(f"Sequence level '{r}' not found in {child}") '''
def read(self,fichero):
......
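A minimal usage sketch of the reworked `ComplexityAnalyzer`. The `Sequence` constructor arguments, the module path, and the `"text"` level name are assumptions for illustration, not confirmed by this diff:

```python
# Hypothetical usage; Sequence construction and level names are assumed.
from textflow.Sequence import Sequence
from textflow.ComplexityAnalyzer import ComplexityAnalyzer  # assumed module path

seq = Sequence("string", "El niño juega en el parque.")  # assumed constructor
analyzer = ComplexityAnalyzer()              # downloads es_core_news_sm on first run
analyzer.analyze(seq, "complexity", "text")  # empty levelOfResult stores at the top level
print(seq.metadata["complexity"])
```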
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch
class EmotionAnalyzer(Analyzer):
def __init__(self, task = "text-classification",modelEmotions = 'pysentimiento/robertuito-emotion-analysis', allScores = True):
"""Creates an analyzer from an input object.
Args:
function: the function of the analyzer like count word, files...
isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
"""
self.emotionsClassifier = pipeline(task,model=modelEmotions, return_all_scores=allScores)
    # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
"""Analyze a sequence
Args:
sequence: the Sequence we want to analyze
tag: the label to store the analysis resut
levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
Raises:
ValueError if the levelOfResult is incorrect
"""
super().analyze(self.emotions,sequence, tag, levelOfAnalyzer, levelOfResult, True)
def emotions(self, arrayText):
arrayResults =[]
for text in arrayText:
prediction = self.emotionsClassifier(text)
#arrayResults.append(prediction[0][0])
arrayResults.append(prediction)
return arrayResults
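Since `emotions` simply maps the pipeline over plain strings, it can be sanity-checked without building a `Sequence`. A sketch; the label names depend on the pysentimiento model and are illustrative:

```python
analyzer = EmotionAnalyzer()
results = analyzer.emotions(["¡Qué alegría verte!", "Esto es terrible."])
# With return_all_scores=True each prediction is a one-element list holding
# every emotion's score, e.g. [[{'label': 'joy', 'score': 0.93}, ...]]
for prediction in results:
    best = max(prediction[0], key=lambda d: d["score"])  # top-scoring emotion
    print(best["label"], round(best["score"], 3))
```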
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
spacy.cli.download("es_core_news_sm")
class LemmaAnalyzer(Analyzer):
    def __init__(self, nlp=spacy.load("es_core_news_sm"), posNoContent=["PUNCT", "SPACE", "SYM"]):
        """Creates a lemma analyzer.

        Args:
            nlp: the spaCy pipeline used to lemmatize.
            posNoContent: POS tags skipped because they carry no lexical content.
        """
self.nlp = nlp
self.posNoContent = posNoContent
    # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
"""Analyze a sequence
Args:
sequence: the Sequence we want to analyze
tag: the label to store the analysis resut
levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
Raises:
ValueError if the levelOfResult is incorrect
"""
super().analyze(self.lemmas,sequence, tag, levelOfAnalyzer, levelOfResult, True)
def lemmas(self, arrayText):
arrayResult = []
for text in arrayText:
sequenceLemmas = []
setLemmas = set()
sumaLenLemmas=0
doc= self.nlp(text)
for token in doc:
if token.pos_ not in self.posNoContent:
sumaLenLemmas += len(token.lemma_)
setLemmas.add(token.lemma_)
sequenceLemmas.append(token.lemma_)
lemma={
"srclemmas" : sequenceLemmas,
"uniqueLemmas" : len(setLemmas),
"avgLemmas" : round(sumaLenLemmas/len(sequenceLemmas))
}
arrayResult.append(lemma)
return arrayResult
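For reference, the shape `lemmas` returns for a single sentence; the exact lemmas depend on the es_core_news_sm version, so the values below are illustrative:

```python
analyzer = LemmaAnalyzer()
result = analyzer.lemmas(["Los niños jugaban en los parques."])
# result[0] is roughly:
# {
#     "srclemmas": ["el", "niño", "jugar", "en", "el", "parque"],
#     "uniqueLemmas": 5,  # "el" repeats
#     "avgLemmas": 4      # round(21 lemma characters / 6 lemmas)
# }
```

Note that `avgLemmas` divides by `len(sequenceLemmas)`, so punctuation-only input would raise `ZeroDivisionError`.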
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
spacy.cli.download("es_core_news_sm")
class POSAnalyzer(Analyzer):
    def __init__(self, nlp=spacy.load("es_core_news_sm")):
        """Creates a POS analyzer.

        Args:
            nlp: the spaCy pipeline used for POS tagging.
        """
self.nlp = nlp
    # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
"""Analyze a sequence
Args:
sequence: the Sequence we want to analyze
tag: the label to store the analysis resut
levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
Raises:
ValueError if the levelOfResult is incorrect
"""
super().analyze(self.pos,sequence, tag, levelOfAnalyzer, levelOfResult, True)
def pos(self,arrayText):
arrayResults = []
for text in arrayText:
srcPOS = []
dicFreqPOS = {}
doc = self.nlp(text)
for token in doc:
srcPOS.append(token.pos_)
if token.pos_ in dicFreqPOS:
dicFreqPOS[token.pos_] += 1
else:
dicFreqPOS[token.pos_] = 1
pos = {
"srcPOS": srcPOS,
"FreqPOS": dicFreqPOS
}
arrayResults.append(pos)
return arrayResults
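The manual if/else counting in `pos` is equivalent to `collections.Counter`; a possible simplification (a sketch, not part of this commit):

```python
from collections import Counter

def pos_frequencies(doc):
    """Same output as the loop in POSAnalyzer.pos for one spaCy doc."""
    srcPOS = [token.pos_ for token in doc]
    return {"srcPOS": srcPOS, "FreqPOS": dict(Counter(srcPOS))}
```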
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch
class PolarityAnalyzer(Analyzer):
def __init__(self, task = "text-classification",modelPolarity = 'finiteautomata/beto-sentiment-analysis', allScores = True):
"""Creates an analyzer from an input object.
Args:
function: the function of the analyzer like count word, files...
isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
"""
self.polarityClassifier = pipeline(task,model= modelPolarity, return_all_scores=allScores)
    # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
"""Analyze a sequence
Args:
sequence: the Sequence we want to analyze
tag: the label to store the analysis resut
levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
Raises:
ValueError if the levelOfResult is incorrect
"""
super().analyze(self.polarity,sequence, tag, levelOfAnalyzer, levelOfResult, True)
def polarity(self, arrayText):
arrayResults =[]
for text in arrayText:
prediction = self.polarityClassifier(text)
#arrayResults.append(prediction[0][0])
arrayResults.append(prediction)
return arrayResults
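transformers pipelines also accept a list of texts, so the per-text loop in `polarity` (and in `emotions` above) could become one batched call. A sketch that keeps the result shape of the current loop, since the pipeline wraps each single-string result in a one-element outer list:

```python
# Possible batched variant of PolarityAnalyzer.polarity (a sketch, not the commit's code).
def polarity_batched(self, arrayText):
    predictions = self.polarityClassifier(list(arrayText))  # one list of label scores per text
    return [[p] for p in predictions]  # re-wrap to match what the per-text loop appends
```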
@@ -4,6 +4,7 @@ from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import SpaceTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import RegexpTokenizer
class SequenceIterator:
......
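The newly imported `RegexpTokenizer` supports custom token patterns; a quick illustration (its use inside `SequenceIterator` is not shown in this hunk):

```python
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r"\w+")         # keep word characters, drop punctuation
print(tokenizer.tokenize("¡Hola, mundo!"))  # ['Hola', 'mundo']
```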
import string
from typing import Optional
#import spacy
#import spacy.cli
from nltk.text import Text
from nltk.tokenize import WhitespaceTokenizer
import math
from textflow.Analyzer import Analyzer
class StylometryAnalyzer(Analyzer): #TODO
    def __init__(self, stopwords, puntuation=string.punctuation, tokenizer=WhitespaceTokenizer()):
        """Creates a stylometry analyzer from a stopword list, a set of
        punctuation marks, and the NLTK tokenizer used to split the text."""
        self.stopwords = stopwords
        self.puntuation = puntuation
        self.tokenizer = tokenizer
    # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
"""Analyze a sequence
Args:
sequence: the Sequence we want to analyze
tag: the label to store the analysis resut
levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
Raises:
ValueError if the levelOfResult is incorrect
"""
if levelOfResult == "":
analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO
resultOfAnalisys= []
for i in analyzeResult:
resultOfAnalisys.append(i)
sequence.metadata[tag] = resultOfAnalisys
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str]= ""):
super().analyze(self.stylometry,sequence, tag, levelOfAnalyzer, levelOfResult, True)
def stylometry(self, arrayText):
resultsList = []
for t in arrayText:
            tokens = self.tokenizer.tokenize(t)
            text = [token.lower() for token in tokens]  # lowercase once, after tokenizing
self.freqWords(text,self.stopwords,self.puntuation)
self.funcionesTTR(text)
result={
"uniqueWords": len(self.uniqueWords),
"TTR": self.TTR,
"RTTR": self.RTTR,
"Herdan": self.herdan,
"Mass": self.mass,
"Somers": self.somers,
"Dugast": self.dugast,
"Honore": self.honore,
"FreqStopWords": self.freqStopWords,
"FreqPuntuationMarks": self.freqPuntuationMarks,
"FreqWords": self.freqWord
}
resultsList.append(result)
return resultsList
def funcionesTTR(self, text):
self.uniqueWords = [token[0] for token in self.freqWord]
self.numWordFreqOne = len( [token[0] for token in self.freqWord if token[1] == 1 ])
self.TTR = len(self.uniqueWords) / len(text)
self.RTTR = len(self.uniqueWords) / math.sqrt(len(text))
self.herdan = math.log(len(self.uniqueWords),10) / math.log(len(text),10)
self.mass = (math.log(len(text),10)- math.log(len(self.uniqueWords),10)) / pow(math.log(len(self.uniqueWords),10),2)
self.somers = math.log(math.log(len(self.uniqueWords),10),10) / math.log(math.log(len(text),10),10)
if math.log(len(text),10)- math.log(len(self.uniqueWords),10) == 0:
self.dugast = pow(math.log(len(text),10),2)
            else:
                self.dugast = pow(math.log(len(text),10),2) / (math.log(len(text),10)- math.log(len(self.uniqueWords),10))
if 1-(self.numWordFreqOne/len(self.uniqueWords)) == 0:
self.honore = 100*(math.log(len(text),10))
else:
self.honore = 100*(math.log(len(text),10)/(1-(self.numWordFreqOne/len(self.uniqueWords))))
def freqWords(self,tokens, stopWords, puntuationMarks):
freqStopWords = {}
freqPuntuationMarks = {}
freqWord ={}
for token in tokens:
if token in stopWords:
if token in freqStopWords:
freqStopWords[token] += 1
else:
freqStopWords[token] = 1
elif token in puntuationMarks:
if token in freqPuntuationMarks:
freqPuntuationMarks[token] += 1
else:
freqPuntuationMarks[token] = 1
else:
if token in freqWord:
freqWord[token] += 1
else:
freqWord[token] = 1
        # NB: sorted() here orders the (token, count) pairs by token, not by frequency.
        self.freqWord = sorted(freqWord.items(), reverse = True)
        self.freqPuntuationMarks = sorted(freqPuntuationMarks.items(), reverse = True)
        self.freqStopWords = sorted(freqStopWords.items(), reverse = True)
\ No newline at end of file
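A quick numeric sanity check of the lexical-richness formulas in `funcionesTTR`, for a toy text with N = 10 tokens and V = 7 unique words:

```python
import math

V, N = 7, 10                                # unique words, total words
TTR = V / N                                 # 0.7
RTTR = V / math.sqrt(N)                     # ~2.2136
herdan = math.log(V, 10) / math.log(N, 10)  # log10(7) / log10(10) ~0.8451
print(TTR, round(RTTR, 4), round(herdan, 4))
```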
from typing import Optional
from textflow.Sequence import Sequence
from nltk.tokenize import WhitespaceTokenizer
from textflow.Analyzer import Analyzer
class VolumetryAnalyzer(Analyzer):
    def __init__(self, tokenizer=WhitespaceTokenizer()):
        """Creates a volumetry analyzer.

        Args:
            tokenizer: the NLTK tokenizer used to split the text into words.
        """
self.tokenizer = tokenizer
def volumetry(self, arrayText):
arrayResults =[]
for texts in arrayText:
text = self.tokenizer.tokenize(texts)
dicResults = {
"words" : len(text),
"uniqueWords" : len(set(text)),
"chars" : len(texts),
"avgWordsLen" : round(len(texts) / len(text))
}
arrayResults.append(dicResults)
return arrayResults
    # The sequence must always have a text attribute in metadata for this analyzer to work
    # Counts the number of words, unique words, characters, and average word length
def analyze(self,sequence,tag,levelOfAnalyzer,levelOfResult:Optional[str] = ""):
super().analyze(self.volumetry,sequence, tag, levelOfAnalyzer, levelOfResult, True)
        '''children = [sequence.children]
        ruta = levelOfAnalyze.split("/")
        for r in ruta: # For each level of the path
            for child in children: # Look at every available sequence
                if r in child: # If r is inside the current sequence
                    if r == ruta[-1]:
                        for seq in child[r]:
                            if "text" not in seq.metadata:
                                raise ValueError(f"Level text not found in {seq.metadata.keys()}")
                            else:
                                text = seq.metadata["text"].split(" ")
                                volumetry= {
                                    "words" : len(text),
                                    "uniqueWords" : len(set(text)),
                                    "chars" : len(seq.metadata["text"]),
                                    "avgWordsLen" : round(volumetry["chars"] / volumetry["words"])
                                }
                                seq.metadata["volumetry"] = volumetry
                    else:
                        children = [c.children for c in child[r]]
                else:
                    raise ValueError(f"Sequence level '{r}' not found in {child}")'''
\ No newline at end of file
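A worked example of `volumetry` with the default `WhitespaceTokenizer`:

```python
analyzer = VolumetryAnalyzer()
print(analyzer.volumetry(["hola mundo hola"]))
# -> [{'words': 3, 'uniqueWords': 2, 'chars': 15, 'avgWordsLen': 5}]
```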