deleting some comments

parent e9f93ab1
@@ -539,7 +539,7 @@ telegram = ["requests"]
 [[package]]
 name = "transformers"
-version = "4.19.1"
+version = "4.19.2"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
 category = "main"
 optional = false
@@ -653,8 +653,8 @@ python-versions = "*"
 [metadata]
 lock-version = "1.1"
-python-versions = "3.8"
-content-hash = "f559d5695f1365c162f02c2146df48de52ad2d38e1b4a26476c7a662dc065365"
+python-versions = "^3.8"
+content-hash = "469bc77da37a726f078d5ae68733d1edabc0d6e9d613f137f57fb0ffde45b43d"

 [metadata.files]
 atomicwrites = [
@@ -1143,8 +1143,8 @@ tqdm = [
     {file = "tqdm-4.64.0.tar.gz", hash = "sha256:40be55d30e200777a307a7585aee69e4eabb46b4ec6a4b4a5f2d9f11e7d5408d"},
 ]
 transformers = [
-    {file = "transformers-4.19.1-py3-none-any.whl", hash = "sha256:16d3dd257d459c2598e2548a9e6875c10b7db5e44494d93b3c0a5c60afad667f"},
-    {file = "transformers-4.19.1.tar.gz", hash = "sha256:6fb30ee534a25b6b3fc7064c280b7f44abf8c9bd1fb358860ebe4fd392bf15f5"},
+    {file = "transformers-4.19.2-py3-none-any.whl", hash = "sha256:1416315b7c5ff1f56d3915f416b67aa254a9907fbb73ef7f7bffc9210446b5fa"},
+    {file = "transformers-4.19.2.tar.gz", hash = "sha256:e19a4ff07458eda143c738e5259caf48449fcf078a63d6b1bd1aa806543440a3"},
 ]
 typer = [
     {file = "typer-0.4.1-py3-none-any.whl", hash = "sha256:e8467f0ebac0c81366c2168d6ad9f888efdfb6d4e1d3d5b4a004f46fa444b5c3"},
...
@@ -5,7 +5,7 @@ description = "A text analysis library for Python"
 authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@red.ujaen.es>"]

 [tool.poetry.dependencies]
-python = "3.8"
+python = "^3.8"
 nltk = "^3.7"
 spacy = "^3.3.0"
 transformers = "^4.19.0"
...
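Note on the constraint change above: `python = "^3.8"` is a caret requirement, so the project now accepts any Python `>=3.8,<4.0` instead of exactly 3.8, and the new `content-hash` in poetry.lock simply reflects the edited dependency table. A quick sanity check of what the caret expands to, sketched with the third-party `packaging` library (an illustration only; Poetry performs this resolution itself):

```python
# Illustrative only: Poetry expands the caret internally; here we mimic it
# with the `packaging` library to show which interpreters now satisfy it.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

caret_3_8 = SpecifierSet(">=3.8,<4.0")  # what "^3.8" expands to

for candidate in ("3.7.9", "3.8.0", "3.10.4", "4.0.0"):
    print(candidate, Version(candidate) in caret_3_8)
# -> 3.7.9 False, 3.8.0 True, 3.10.4 True, 4.0.0 False
```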
@@ -10,14 +10,16 @@ class Analyzer(ABC):
     @abstractmethod
     def analyze(self, functionAnalyzer, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = "", analyzeMetadata: Optional[bool] = False):  #TODO
-        """Analyze a sequence
+        """
+        Abstract method that analyzes a sequence.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis resut
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
-            levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
+            functionAnalyzer: the function of the analyzer.
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children (False).

         Raises:
             ValueError if the levelOfResult is incorrect
@@ -65,7 +67,7 @@ class Analyzer(ABC):
             else:
-                raise ValueError(f"Sequence level '{r}' not found in {child}")
\ No newline at end of file
+                raise ValueError(f"Sequence level '{r}' not found in {child}")
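The cleaned-up docstring above is also the clearest statement of the base-class contract: concrete analyzers implement `analyze` by forwarding their own analysis function to `super().analyze(...)` together with the metadata flag. A minimal sketch of a conforming subclass (the `WordCountAnalyzer` name and its counting logic are hypothetical, not part of this repository):

```python
from typing import Optional
from textflow.Analyzer import Analyzer

class WordCountAnalyzer(Analyzer):
    """Hypothetical analyzer following the same pattern as the built-in ones."""

    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
        # Forward our analysis function; True stores results in metadata,
        # matching what every concrete analyzer in this commit does.
        super().analyze(self.wordCount, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def wordCount(self, arrayText):
        # One result dictionary per input text, like volumetry/emotions/polarity.
        return [{"words": len(text.split())} for text in arrayText]
```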
@@ -11,62 +11,47 @@ from textflow.Analyzer import Analyzer
 creaPath = os.path.join(os.path.dirname(__file__), 'Crea-5000.txt')
+spacy.cli.download("es_core_news_sm")

 class ComplexityAnalyzer(Analyzer):
-    def __init__(self, rutaArchivoCrea = creaPath,lang = "es"):
-        """Creates an analyzer from an input object.
+    def __init__(self, rutaArchivoCrea = creaPath, nlp = spacy.load("es_core_news_sm")):
+        """
+        Create a complexity analyzer from an input object.

         Args:
-            function: the function of the analyzer like count word, files...
+            rutaArchivoCrea: the file that contains the most frequent words of the Spanish language.
             isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
         """
-        if lang == "es":
-            spacy.cli.download("es_core_news_sm")
-            self.nlp = spacy.load("es_core_news_sm")
-            #Vamos a cargar CREA:
-            self.dicFreqWords=self.read(rutaArchivoCrea)
+        self.nlp = nlp
+        # Load CREA (the Spanish word-frequency list):
+        self.dicFreqWords = self.read(rutaArchivoCrea)

-    #Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
-        """Analyze a sequence
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
+        """
+        Analyze a sequence with a complexity function.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis resut
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
-            levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children (False).

         Raises:
             ValueError if the levelOfResult is incorrect
         """
         super().analyze(self.complexity,sequence, tag, levelOfAnalyzer, levelOfResult, True)
-        '''if levelOfResult == "":
-            analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO
-            resultOfAnalisys= []
-            for i in analyzeResult:
-                resultOfAnalisys.append(i)
-            sequence.metadata[tag] = resultOfAnalisys
-        else:
-            children = [sequence.children]
-            ruta = levelOfResult.split("/")
-            for r in ruta: #Para cada nivel de la ruta
-                for child in children: #Miramos en todas las secuencias disponibles
-                    if r in child: #Si dentro de la secuencia actual está r
-                        if r == ruta[-1]:
-                            for seq in child[r]:
-                                analyzeResult = seq.filterMetadata(levelOfAnalyzer,self.function)
-                                resultOfAnalisys= []
-                                for i in analyzeResult:
-                                    resultOfAnalisys.append(i)
-                                seq.metadata[tag] = resultOfAnalisys
-                        else:
-                            children = [c.children for c in child[r]]
-                    else:
-                        raise ValueError(f"Sequence level '{r}' not found in {child}") '''

     def read(self,fichero):
+        """
+        Function that reads a txt file.
+
+        Args:
+            fichero: the path of the file to read.
+        """
         with open(fichero,'r',encoding='latin-1') as file:
             next(file)
             lines = file.readlines()
@@ -80,6 +65,16 @@ class ComplexityAnalyzer(Analyzer):
     def complexity(self, arrayText):
+        """
+        Function that analyzes the complexity of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list of dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
         arrayResults =[]
         for text in arrayText:
             doc= self.nlp (text)
@@ -123,7 +118,12 @@ class ComplexityAnalyzer(Analyzer):
     def simplesMetrics(self, doc):
-        #Simple metrics son los signos de puntuación, el numero de frases, el numero de frases con contenido...
+        """
+        Function that calculates the simple metrics of a doc
+        (punctuation marks, number of sentences, number of content sentences, ...).
+
+        Args:
+            doc: a spaCy Doc (sequence of tokens).
+        """
         self.sentences = [s for s in doc.sents]
         self.numSentences = len(self.sentences)
         pcs = []
@@ -155,13 +155,19 @@ class ComplexityAnalyzer(Analyzer):
         self.numChars = numChars

     def analyzeLegibility(self,doc):
+        """
+        Function that analyzes the legibility of a text.
+
+        Args:
+            doc: a sequence of tokens.
+        """
         self.readabilityFH = 206.84 - 0.60*(self.numSyllabes/self.numWords) - 1.02*(self.numWords/self.numSentences)
         self.perspicuityIFSZ = 206.835 - ((62.3*self.numSyllabes)/self.numWords) - (self.numWords/self.numSentences)
         numLetters = 0
         listLenLetters =[]
         for token in doc:
-            if token.text.isalpha(): #Si es una palabra
+            if token.text.isalpha():
                 numLetters += len(token.text)
                 listLenLetters.append(len(token.text))
@@ -172,6 +178,9 @@ class ComplexityAnalyzer(Analyzer):
         self.muLegibility = (self.numWords/(self.numWords-1))*(avgLettersWords/listLenLetters.var())*100

     def lexicalIndex(self):
+        """
+        Function that calculates different lexical indexes of a text.
+        """
         self.numContentWords = reduce((lambda a, b: a + b), [len(s) for s in self.posContentSentences])
         self.numDistinctContentWords = len(set([w.text.lower() for s in self.posContentSentences for w in s]))
         if self.numContentWords == 0:
@@ -184,10 +193,16 @@ class ComplexityAnalyzer(Analyzer):
     def readability(self):
+        """
+        Function that calculates the readability of a text.
+        """
         self.autoReadabilityIndex = 4.71 * self.numChars / self.numWords + 0.5 * self.numWords/self.numContentSentences
         self.spauldingScore = 1.609*(self.numWords / self.numContentSentences) + 331.8* (self.numRareWord /self.numWords) + 22.0

     def countRareAndLowWord(self):
+        """
+        Function that counts the rare and low-frequency words of a text.
+        """
         freqWord = sorted(self.dicFreqWords, key = self.dicFreqWords.__getitem__, reverse = True)[:1500]
         countRareWord = 0
         countLowWord = 0
@@ -201,6 +216,9 @@ class ComplexityAnalyzer(Analyzer):
         self.numLowWord = countLowWord

     def sentenceComplexity(self):
+        """
+        Function that calculates the complexity at sentence level.
+        """
         numComplexSentence=0
         for sentence in self.sentences:
             verb = False
@@ -224,6 +242,12 @@ class ComplexityAnalyzer(Analyzer):
     def countSyllabes(self, text):
+        """
+        Function that counts the syllables of a text.
+
+        Args:
+            text: a string with the text to analyze.
+        """
         t = re.sub(r'y([aáeéiíoóuú])', '\\1', text.lower())
         t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())
         t = re.sub(r'[iu][aáeyéioóu]', 'A', t).lower()
@@ -240,6 +264,9 @@ class ComplexityAnalyzer(Analyzer):
         return 1 + max(self.treeHeight(x, cont) for x in root.children)

     def embeddingDepth(self):
+        """
+        Function that calculates the embedding depth (dependency-tree height) of a text.
+        """
         roots = [sent.root for sent in self.sentences]
         max_list = []
         max_list = [self.treeHeight(root,0) for root in roots]
@@ -251,9 +278,12 @@ class ComplexityAnalyzer(Analyzer):
         return self.max_max_list, self.min_max_list, self.mean_max_list

     def ageReadability(self):
+        """
+        Function that calculates the age readability of a text.
+        """
         self.solReadability = -2.51 + 0.74*(3.1291+1.0430*math.sqrt(self.numWords3Syllabes*(30/self.numSentences)))
         self.minAge = 0.2495* (self.numWords/self.numSentences) + 6.4763*(self.numSyllabes/self.numWords) - 7.1395
         self.crawford = -20.5*(self.numSentences/self.numWords)+4.9*(self.numSyllabes/self.numWords)-3.407
-        pass
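The constructor change is the behavioural one in this file: the spaCy model is now injected (with `es_core_news_sm` as the default, downloaded at import time) instead of being selected through the removed `lang` flag. A usage sketch under the signatures shown above; the module paths, the empty level arguments, and the sample sentence are assumptions for illustration:

```python
import spacy
from textflow.Sequence import Sequence                       # module path assumed
from textflow.ComplexityAnalyzer import ComplexityAnalyzer   # module path assumed

seq = Sequence("string", "La lectura diaria mejora la comprensión lectora.")
analyzer = ComplexityAnalyzer(nlp=spacy.load("es_core_news_sm"))

# Analyze the sequence's own text and store the metrics under the "complexity" tag
# (empty level paths assumed to mean "this level").
analyzer.analyze(seq, "complexity", "", "")
print(seq.metadata["complexity"])
```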
@@ -8,33 +8,40 @@ import torch
 class EmotionAnalyzer(Analyzer):
     def __init__(self, task = "text-classification",modelEmotions = 'pysentimiento/robertuito-emotion-analysis', allScores = True):
-        """Creates an analyzer from an input object.
+        """
+        Create an emotion analyzer.

         Args:
-            function: the function of the analyzer like count word, files...
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
+            task: the task defining which pipeline will be returned.
+            modelEmotions: the model that will be used by the pipeline to make predictions.
+            allScores: True if we want the classifier to return all scores, False otherwise.
         """
         self.emotionsClassifier = pipeline(task,model=modelEmotions, return_all_scores=allScores)

-    #Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
-        """Analyze a sequence
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
+        """
+        Analyze a sequence with an emotion function.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis resut
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
-            levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
-
-        Raises:
-            ValueError if the levelOfResult is incorrect
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
         """
         super().analyze(self.emotions,sequence, tag, levelOfAnalyzer, levelOfResult, True)

     def emotions(self, arrayText):
+        """
+        Function that analyzes the emotions of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list of dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
         arrayResults =[]
         for text in arrayText:
             prediction = self.emotionsClassifier(text)
...
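The wrapped classifier is a plain transformers pipeline, so the `allScores` behaviour can be reproduced outside textflow with the same checkpoint (transformers 4.19-era API; `return_all_scores` was deprecated in favour of `top_k` in later releases). The printed output is indicative:

```python
from transformers import pipeline

# Same construction as EmotionAnalyzer.__init__ with its default arguments.
clf = pipeline("text-classification",
               model="pysentimiento/robertuito-emotion-analysis",
               return_all_scores=True)

print(clf("¡Qué alegría verte!"))
# e.g. [[{'label': 'joy', 'score': 0.98}, {'label': 'sadness', 'score': ...}, ...]]
```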
@@ -8,33 +8,39 @@ spacy.cli.download("es_core_news_sm")
 class LemmaAnalyzer(Analyzer):
     def __init__(self, nlp = spacy.load("es_core_news_sm"), posNoContent = ["PUNCT", "SPACE", "SYM"]):
-        """Creates an analyzer from an input object.
+        """Create an analyzer from an input object.

         Args:
-            function: the function of the analyzer like count word, files...
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
+            nlp: a language model.
+            posNoContent: a list with the POS tags for which we don't want to get the lemma.
         """
         self.nlp = nlp
         self.posNoContent = posNoContent

-    #Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
-        """Analyze a sequence
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
+        """
+        Analyze a sequence with a lemma function.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis resut
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
-            levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
-
-        Raises:
-            ValueError if the levelOfResult is incorrect
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
         """
         super().analyze(self.lemmas,sequence, tag, levelOfAnalyzer, levelOfResult, True)

     def lemmas(self, arrayText):
+        '''
+        Function that gets the lemmas of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list of dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        '''
         arrayResult = []
         for text in arrayText:
             sequenceLemmas = []
...
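The `posNoContent` default means punctuation, whitespace, and symbol tokens never contribute lemmas. The token-level filtering can be reproduced with spaCy alone (the dictionary layout returned by `lemmas` is not shown in this hunk, so only the filtering logic is sketched):

```python
import spacy

nlp = spacy.load("es_core_news_sm")
posNoContent = ["PUNCT", "SPACE", "SYM"]  # same default as LemmaAnalyzer

doc = nlp("Los gatos duermen en el jardín.")
lemmas = [tok.lemma_ for tok in doc if tok.pos_ not in posNoContent]
print(lemmas)  # e.g. ['el', 'gato', 'dormir', 'en', 'el', 'jardín'], with no entry for '.'
```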
@@ -8,31 +8,35 @@ spacy.cli.download("es_core_news_sm")
 class POSAnalyzer(Analyzer):
     def __init__(self, nlp = spacy.load("es_core_news_sm")):
-        """Creates an analyzer from an input object.
+        """
+        Create a POS analyzer from an input object.

         Args:
-            function: the function of the analyzer like count word, files...
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
+            nlp: a language model.
         """
         self.nlp = nlp

-    #Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
-        """Analyze a sequence
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
+        """
+        Analyze a sequence with a POS tagger function.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis resut
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
-            levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
-
-        Raises:
-            ValueError if the levelOfResult is incorrect
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
         """
         super().analyze(self.pos,sequence, tag, levelOfAnalyzer, levelOfResult, True)

     def pos(self,arrayText):
+        '''
+        Function that gets the POS tags of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list of dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        '''
         arrayResults = []
         for text in arrayText:
             srcPOS = []
...
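Same structure as the lemma analyzer, but keeping spaCy's coarse-grained `pos_` tag per token. Equivalent standalone logic, with indicative output:

```python
import spacy

nlp = spacy.load("es_core_news_sm")
doc = nlp("Los gatos duermen en el jardín.")
print([(tok.text, tok.pos_) for tok in doc])
# e.g. [('Los', 'DET'), ('gatos', 'NOUN'), ('duermen', 'VERB'), ..., ('.', 'PUNCT')]
```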
-import os
-import spacy
-import spacy.cli
 from typing import Optional
 from textflow.Analyzer import Analyzer
 from transformers import pipeline
@@ -8,32 +5,40 @@ import torch
 class PolarityAnalyzer(Analyzer):
     def __init__(self, task = "text-classification",modelPolarity = 'finiteautomata/beto-sentiment-analysis', allScores = True):
-        """Creates an analyzer from an input object.
+        """
+        Create a polarity analyzer.

         Args:
-            function: the function of the analyzer like count word, files...
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
+            task: the task defining which pipeline will be returned.
+            modelPolarity: the model that will be used by the pipeline to make predictions.
+            allScores: True if we want the classifier to return all scores, False otherwise.
         """
         self.polarityClassifier = pipeline(task,model= modelPolarity, return_all_scores=allScores)

-    #Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
-        """Analyze a sequence
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""):
+        """
+        Analyze a sequence with a polarity function.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis resut
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result(la subsequencia a analizar dentro de la sequencia en la que queremos almacenar el resultado)
-            levelOfResult: the path of the sequence level to store the result. (Podemos querer analizar los tokens pero almacenarlo a nivel de oracion)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children(False)
-
-        Raises:
-            ValueError if the levelOfResult is incorrect
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
         """
         super().analyze(self.polarity,sequence, tag, levelOfAnalyzer, levelOfResult, True)

     def polarity(self, arrayText):
+        """
+        Function that analyzes the polarity of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list of dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
         arrayResults =[]
         for text in arrayText:
             prediction = self.polarityClassifier(text)
...
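This class follows the same wrapper pattern as EmotionAnalyzer, here around a sentiment checkpoint whose labels are POS/NEG/NEU. A standalone equivalent of the classifier it builds (output indicative):

```python
from transformers import pipeline

# Same construction as PolarityAnalyzer.__init__ with its default arguments.
clf = pipeline("text-classification",
               model="finiteautomata/beto-sentiment-analysis",
               return_all_scores=True)

print(clf("Me encanta esta librería."))
# e.g. [[{'label': 'POS', 'score': 0.99}, {'label': 'NEU', ...}, {'label': 'NEG', ...}]]
```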
@@ -10,7 +10,7 @@ from nltk.tokenize import RegexpTokenizer
 class SequenceIterator:
     def __init__(self, children):
         """
-        Creates a sequenceIterator from a Sequence.
+        Create a sequenceIterator from a Sequence.

         Args:
             children: A list with the values of the attribute children of a Sequence.
         """
@@ -39,7 +39,7 @@ class SequenceIterator:
         raise StopIteration

-_VALID_FORMATS = ["directory","string", "text", "token", None]
+_VALID_FORMATS = ["directory","string", "file", "token", None]

 class Sequence:
     """Summary of class here.
@@ -59,6 +59,7 @@ class Sequence:
             format: A string containing the input data's type.
             src: An object representing the input data. It can be a string for a
                 string format or a file path for a text format.
+
         Raises:
             ValueError: If the format is wrong.
         """
@@ -71,14 +72,14 @@ class Sequence:
         self.format = format
         self.children = {}
-        self.metadata = {"text": " "}
+        self.metadata = {}

         if format == "token":
             if not isinstance(src, str):
                 raise ValueError(f"{src} is not an instance of token")
             self.metadata["text"] = src
         if format == "string":
             self.initFromString(src,"tokens","token",tokenizer)
-        if format == "text":
+        if format == "file":
             self.initFromDocument(src,"tokens","token", tokenizer)
         if format == "directory":
             self.initFromDirectory(src,"directory","files",tokenizer)
@@ -86,12 +87,12 @@ class Sequence:
     def initFromDirectory(self, directory, labelDirectory, labelFile, tokenizer):
         '''
         Initialize a Sequence from a directory
+
         Args:
             directory: the path of a directory as string
             labelDirectory: the name of the children dictionary entry for the subpaths
             labelFile: the name of the children dictionary entry for the files
         '''
-        #print(os.path.abspath((os.getcwd())))
         self.format = "directory"
         self.metadata["nameFiles"] = []
         self.metadata["directoriesPath"] = []
@@ -102,9 +103,9 @@ class Sequence:
             if os.path.isfile(directory+"/"+file):
                 self.metadata["nameFiles"].append(file)
                 if labelFile in self.children:
-                    self.children[labelFile].append(Sequence("text", directory+"/"+file ))
+                    self.children[labelFile].append(Sequence("file", directory+"/"+file ))
                 else:
-                    self.children[labelFile]= [Sequence("text", directory+"/"+file)]
+                    self.children[labelFile]= [Sequence("file", directory+"/"+file)]
             else:
                 self.metadata["directoriesPath"].append(directory+"/"+file)
                 if labelDirectory in self.children:
@@ -116,12 +117,13 @@ class Sequence:
     def initFromDocument(self, documentPath, labelSubSequence, formatSubsequence, tokenizer):
         '''
         Initialize a Sequence from a document
+
         Args:
             documentPath: the path of a document as string
             labelSubSequence: the name of the children dictionary entry for the subsequence as string
             formatSubSequence: the format of the subsequence in children dictionary entry as string
         '''
-        self.format = "text"
+        self.format = "file"
         with open(documentPath, "r") as f:
             txt = f.read()
         self.children[labelSubSequence] = [Sequence(formatSubsequence,token_src) for token_src in tokenizer.tokenize(txt)]
@@ -130,10 +132,12 @@ class Sequence:
     def initFromString(self, srcString, labelSubSequence, formatSubsequence, tokenizer):
         '''
         Initialize a Sequence from a string
+
         Args:
             srcString: source string of the sequence
             labelSubSequence: the name of the children dictionary entry for the subsequence as string
             formatSubSequence: the format of the subsequence in children dictionary entry as string
+
         Raises:
             ValueError: If srcString isn't a string.
         '''
@@ -171,8 +175,9 @@ class Sequence:
     def __len__(self):
         '''
-        Calculate the length of a Sequence
+        Calculate the length of a Sequence.
         The length of a Sequence is the length of the children.
+
         Returns:
             A number with the length of the Sequence
         '''
@@ -190,6 +195,7 @@ class Sequence:
     def __getitem__(self, idx):
         '''
         Get the value of a key from the dictionary of children
+
         Args:
             idx: a string that represent the key of the children dictionary
                 or an integer that represent the position of the key in children dictionary keys
@@ -307,7 +313,6 @@ class Sequence:
                         metadata = [c.metadata for c in child[r]]
                     else:
                         raise ValueError(f"Sequence level '{r}' not found in {child}")
-        #yield criteria(results)
         cont=0
         gen = criteria(results)
         for r in gen:
...
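The user-visible change in this file is the rename of the `"text"` input format to `"file"` (plus `metadata` now starting empty instead of holding a placeholder text), so call sites that build a Sequence from a document on disk must be updated. A before/after sketch, assuming the constructor takes the format string first and provides a default tokenizer; the file path is illustrative:

```python
from textflow.Sequence import Sequence   # module path assumed

# Old API: Sequence("text", "corpus/review1.txt"); "text" is no longer in
# _VALID_FORMATS, so format validation now rejects it.
seq = Sequence("file", "corpus/review1.txt")   # new format name

print(seq.format)   # "file"
print(len(seq))     # size of the children dict (the "tokens" entry)
```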
@@ -9,21 +9,46 @@ import math
 from textflow.Analyzer import Analyzer

-class StylometryAnalyzer(Analyzer): #TODO
+class StylometryAnalyzer(Analyzer):
     def __init__(self,stopwords, puntuation = string.punctuation,tokenizer = WhitespaceTokenizer()):
+        """
+        Create a stylometry analyzer from an input object.
+
+        Args:
+            stopwords: a list of stopwords.
+            puntuation: a list of punctuation marks.
+            tokenizer: a function to tokenize the text.
+        """
         self.stopwords = stopwords
         self.puntuation = puntuation
         self.tokenizer = tokenizer

-    #Este analizador, solo puede analizar cadenas de texto, por lo que solo tiene sentido que use el atributo text de metadata
     def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str]= ""):
+        """
+        Analyze a sequence with a stylometry function.
+
+        Args:
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+        """
         super().analyze(self.stylometry,sequence, tag, levelOfAnalyzer, levelOfResult, True)

     def stylometry(self, arrayText):
+        '''
+        Function that gets the stylometry (some indexes, word frequencies, ...) of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list of dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        '''
         resultsList = []
         for t in arrayText:
-            #doc = self.nlp(text)
             t.lower()
             tokens = self.tokenizer.tokenize (t)
             text= [token.lower() for token in tokens]
@@ -46,6 +71,12 @@ class StylometryAnalyzer(Analyzer): #TODO
         return resultsList

     def funcionesTTR(self, text):
+        """
+        Function that calculates different TTR indexes.
+
+        Args:
+            text: a string with the text to analyze.
+        """
         self.uniqueWords = [token[0] for token in self.freqWord]
         self.numWordFreqOne = len( [token[0] for token in self.freqWord if token[1] == 1 ])
         self.TTR = len(self.uniqueWords) / len(text)
@@ -64,6 +95,14 @@ class StylometryAnalyzer(Analyzer): #TODO
     def freqWords(self,tokens, stopWords, puntuationMarks):
+        """
+        Function that counts the frequency of stopwords, punctuation marks, and words in a list of tokens.
+
+        Args:
+            tokens: a list of tokens whose frequencies we want to count.
+            stopWords: a list with the stopwords.
+            puntuationMarks: a list with the punctuation marks.
+        """
         freqStopWords = {}
         freqPuntuationMarks = {}
         freqWord ={}
...
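`funcionesTTR` derives its indexes from the frequency table built by `freqWords`; the core type-token ratio is unique types divided by total tokens, as the `self.TTR` line in the hunk shows. A worked micro-example with the same tokenizer:

```python
from nltk.tokenize import WhitespaceTokenizer

tokens = [t.lower() for t in WhitespaceTokenizer().tokenize("a b a c a b")]
ttr = len(set(tokens)) / len(tokens)
print(ttr)  # 3 unique types / 6 tokens = 0.5
```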
@@ -5,16 +5,25 @@ from textflow.Analyzer import Analyzer
 class VolumetryAnalyzer(Analyzer):
     def __init__(self, tokenizer= WhitespaceTokenizer()):
-        """Creates an analyzer from an input object.
+        """
+        Create a volumetry analyzer from an input object.

         Args:
-            function: the function of the analyzer like count word, files...
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
+            tokenizer: the way to split a text into tokens.
         """
         self.tokenizer = tokenizer

     def volumetry(self, arrayText):
+        """
+        Function that analyzes the volumetry of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list of dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
         arrayResults =[]
         for texts in arrayText:
             text = self.tokenizer.tokenize(texts)
@@ -27,33 +36,18 @@ class VolumetryAnalyzer(Analyzer):
             arrayResults.append(dicResults)
         return arrayResults

-    #La secuencia siempre debe tener un atributo texto(metadata) para que este funcione
-    #Contar el numero de palabras, numero de palabras unicas, numero de caracteres y numero medio de caracteres
     def analyze(self,sequence,tag,levelOfAnalyzer,levelOfResult:Optional[str] = ""):
-        super().analyze(self.volumetry,sequence, tag, levelOfAnalyzer, levelOfResult, True)
-        '''children = [sequence.children]
-        ruta = levelOfAnalyze.split("/")
-        for r in ruta: #Para cada nivel de la ruta
-            for child in children: #Miramos en todas las secuencias disponibles
-                if r in child: #Si dentro de la secuencia actual está r
-                    if r == ruta[-1]:
-                        for seq in child[r]:
-                            if "text" not in seq.metadata:
-                                raise ValueError(f"Level text not found in {seq.metadata.keys()}")
-                            else:
-                                text = seq.metadata["text"].split(" ")
-                                volumetry= {
-                                    "words" : len(text),
-                                    "uniqueWords" : len(set(text)),
-                                    "chars" : len(seq.metadata["text"]),
-                                    "avgWordsLen" : round(volumetry["chars"] / volumetry["words"])
-                                }
-                                seq.metadata["volumetry"] = volumetry
-                        else:
-                            children = [c.children for c in child[r]]
-                else:
-                    raise ValueError(f"Sequence level '{r}' not found in {child}")'''
+        """
+        Analyze a sequence with a volumetry function.
+
+        Args:
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+        """
+        super().analyze(self.volumetry,sequence, tag, levelOfAnalyzer, levelOfResult, True)
\ No newline at end of file
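The commented-out block deleted above was also the only written description of the four volumetry metrics, and it even carried a bug (it read `volumetry["chars"]` while the `volumetry` dict was still being constructed). For reference, the same computation as a corrected standalone sketch; the keys follow the deleted block, since the live `volumetry` method's exact dict is not shown in this hunk:

```python
from nltk.tokenize import WhitespaceTokenizer

def volumetry(text: str) -> dict:
    """Word and character counts in the style of VolumetryAnalyzer.volumetry."""
    words = WhitespaceTokenizer().tokenize(text)
    chars = len(text)
    return {
        "words": len(words),
        "uniqueWords": len(set(words)),
        "chars": chars,
        "avgWordsLen": round(chars / len(words)),  # computed from locals, not the dict itself
    }

print(volumetry("hola hola mundo"))
# {'words': 3, 'uniqueWords': 2, 'chars': 15, 'avgWordsLen': 5}
```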