deleting some comments

parent e9f93ab1
@@ -539,7 +539,7 @@ telegram = ["requests"]

 [[package]]
 name = "transformers"
-version = "4.19.1"
+version = "4.19.2"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
 category = "main"
 optional = false
@@ -653,8 +653,8 @@ python-versions = "*"

 [metadata]
 lock-version = "1.1"
-python-versions = "3.8"
-content-hash = "f559d5695f1365c162f02c2146df48de52ad2d38e1b4a26476c7a662dc065365"
+python-versions = "^3.8"
+content-hash = "469bc77da37a726f078d5ae68733d1edabc0d6e9d613f137f57fb0ffde45b43d"

 [metadata.files]
 atomicwrites = [
@@ -1143,8 +1143,8 @@ tqdm = [
     {file = "tqdm-4.64.0.tar.gz", hash = "sha256:40be55d30e200777a307a7585aee69e4eabb46b4ec6a4b4a5f2d9f11e7d5408d"},
 ]
 transformers = [
-    {file = "transformers-4.19.1-py3-none-any.whl", hash = "sha256:16d3dd257d459c2598e2548a9e6875c10b7db5e44494d93b3c0a5c60afad667f"},
-    {file = "transformers-4.19.1.tar.gz", hash = "sha256:6fb30ee534a25b6b3fc7064c280b7f44abf8c9bd1fb358860ebe4fd392bf15f5"},
+    {file = "transformers-4.19.2-py3-none-any.whl", hash = "sha256:1416315b7c5ff1f56d3915f416b67aa254a9907fbb73ef7f7bffc9210446b5fa"},
+    {file = "transformers-4.19.2.tar.gz", hash = "sha256:e19a4ff07458eda143c738e5259caf48449fcf078a63d6b1bd1aa806543440a3"},
 ]
 typer = [
     {file = "typer-0.4.1-py3-none-any.whl", hash = "sha256:e8467f0ebac0c81366c2168d6ad9f888efdfb6d4e1d3d5b4a004f46fa444b5c3"},
......
@@ -5,7 +5,7 @@ description = "A text analysis library for Python"
 authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@red.ujaen.es>"]

 [tool.poetry.dependencies]
-python = "3.8"
+python = "^3.8"
 nltk = "^3.7"
 spacy = "^3.3.0"
 transformers = "^4.19.0"
......
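For context on the constraint change above: Poetry's caret operator widens the requirement from exactly 3.8 to any compatible 3.x release, i.e. `>=3.8,<4.0`, and the lock file's `content-hash` changes accordingly. A minimal sketch of the rule (illustrative helper, not project code):

```python
# Illustration of the caret constraint "^3.8": accepts >=3.8,<4.0.
def satisfies_caret_3_8(major: int, minor: int) -> bool:
    return major == 3 and minor >= 8

assert satisfies_caret_3_8(3, 10)      # 3.10 is compatible
assert not satisfies_caret_3_8(4, 0)   # 4.0 is not
```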
@@ -10,14 +10,16 @@ class Analyzer(ABC):

     @abstractmethod
     def analyze(self, functionAnalyzer, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = "", analyzeMetadata: Optional[bool] = False): #TODO
-        """Analyze a sequence
+        """
+        Abstract method that analyzes a sequence.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis result
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
-            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children (False)
+            functionAnalyzer: the function of the analyzer.
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children (False).

         Raises:
             ValueError if the levelOfResult is incorrect
@@ -65,7 +67,7 @@ class Analyzer(ABC):
                 else:
                     raise ValueError(f"Sequence level '{r}' not found in {child}")
 \ No newline at end of file
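To make the abstract template concrete, here is a minimal hypothetical subclass (not part of this diff) following the same pattern the concrete analyzers below use: the subclass supplies its analysis function and delegates the `levelOfResult` traversal to the base class.

```python
from typing import Optional
from textflow.Analyzer import Analyzer

class CharCountAnalyzer(Analyzer):
    """Hypothetical example analyzer: counts characters of each text."""

    def charCount(self, arrayText):
        # One result dictionary per input text.
        return [{"chars": len(text)} for text in arrayText]

    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
        # Hand our function to the base class, which walks levelOfResult
        # and stores the results under `tag` in the matching metadata.
        super().analyze(self.charCount, sequence, tag, levelOfAnalyzer, levelOfResult, True)
```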
@@ -11,62 +11,47 @@ from textflow.Analyzer import Analyzer

 creaPath = os.path.join(os.path.dirname(__file__), 'Crea-5000.txt')
+spacy.cli.download("es_core_news_sm")

 class ComplexityAnalyzer(Analyzer):
-    def __init__(self, rutaArchivoCrea = creaPath, lang = "es"):
-        """Creates an analyzer from an input object.
+    def __init__(self, rutaArchivoCrea = creaPath, nlp = spacy.load("es_core_news_sm")):
+        """
+        Create a complexity analyzer from an input object.

         Args:
-            function: the function of the analyzer like count words, files...
             rutaArchivoCrea: the file that contains the most frequent words of the Spanish language
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children (False)
         """
-        if lang == "es":
-            spacy.cli.download("es_core_news_sm")
-            self.nlp = spacy.load("es_core_news_sm")
-        # Load CREA:
-        self.dicFreqWords = self.read(rutaArchivoCrea)
+        self.nlp = nlp
+        # Load CREA:
+        self.dicFreqWords = self.read(rutaArchivoCrea)
 # This analyzer can only process text strings, so it only makes sense to use the text attribute of metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""): #TODO
-        """Analyze a sequence
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
+        """
+        Analyze a sequence with a complexity function.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis result
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
-            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children (False)
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children (False).

         Raises:
             ValueError if the levelOfResult is incorrect
         """
         super().analyze(self.complexity, sequence, tag, levelOfAnalyzer, levelOfResult, True)
-        '''if levelOfResult == "":
-            analyzeResult = sequence.filterMetadata(levelOfAnalyzer, self.function) #TODO
-            resultOfAnalisys = []
-            for i in analyzeResult:
-                resultOfAnalisys.append(i)
-            sequence.metadata[tag] = resultOfAnalisys
-        else:
-            children = [sequence.children]
-            ruta = levelOfResult.split("/")
-            for r in ruta:  # For each level of the path
-                for child in children:  # Look at every available sequence
-                    if r in child:  # If r is inside the current sequence
-                        if r == ruta[-1]:
-                            for seq in child[r]:
-                                analyzeResult = seq.filterMetadata(levelOfAnalyzer, self.function)
-                                resultOfAnalisys = []
-                                for i in analyzeResult:
-                                    resultOfAnalisys.append(i)
-                                seq.metadata[tag] = resultOfAnalisys
-                        else:
-                            children = [c.children for c in child[r]]
-                    else:
-                        raise ValueError(f"Sequence level '{r}' not found in {child}") '''
     def read(self, fichero):
+        """
+        Function that reads a txt file.
+
+        Args:
+            fichero: the path of the file to read.
+        """
         with open(fichero, 'r', encoding='latin-1') as file:
             next(file)
             lines = file.readlines()
@@ -80,6 +65,16 @@ class ComplexityAnalyzer(Analyzer):

     def complexity(self, arrayText):
+        """
+        Function that analyzes the complexity of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list with dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
         arrayResults = []
         for text in arrayText:
             doc = self.nlp(text)
@@ -123,7 +118,12 @@ class ComplexityAnalyzer(Analyzer):

     def simplesMetrics(self, doc):
         # Simple metrics are things like the punctuation marks, the number of sentences, the number of content sentences...
+        """
+        Function that calculates simple metrics of a doc.
+
+        Args:
+            doc: sequence of tokens object.
+        """
         self.sentences = [s for s in doc.sents]
         self.numSentences = len(self.sentences)
         pcs = []
@@ -155,13 +155,19 @@ class ComplexityAnalyzer(Analyzer):
         self.numChars = numChars

     def analyzeLegibility(self, doc):
+        """
+        Function that analyzes the legibility of a text.
+
+        Args:
+            doc: a sequence of tokens.
+        """
         self.readabilityFH = 206.84 - 0.60*(self.numSyllabes/self.numWords) - 1.02*(self.numWords/self.numSentences)
         self.perspicuityIFSZ = 206.835 - ((62.3*self.numSyllabes)/self.numWords) - (self.numWords/self.numSentences)
         numLetters = 0
         listLenLetters = []
         for token in doc:
-            if token.text.isalpha():  # If it is a word
+            if token.text.isalpha():
                 numLetters += len(token.text)
                 listLenLetters.append(len(token.text))
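A worked example of the two scores just added above, with made-up counts (220 syllables, 100 words, 8 sentences); the code divides raw syllable and word counts, so this mirrors the implementation as written:

```python
# readabilityFH (Fernández Huerta) and perspicuityIFSZ (Szigriszt-Pazos)
# computed with assumed counts.
num_syllables, num_words, num_sentences = 220, 100, 8

readability_fh = 206.84 - 0.60 * (num_syllables / num_words) - 1.02 * (num_words / num_sentences)
perspicuity_ifsz = 206.835 - (62.3 * num_syllables) / num_words - num_words / num_sentences

print(round(readability_fh, 2))    # ≈ 192.77
print(round(perspicuity_ifsz, 2))  # ≈ 57.28
```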
@@ -172,6 +178,9 @@ class ComplexityAnalyzer(Analyzer):
         self.muLegibility = (self.numWords/(self.numWords-1))*(avgLettersWords/listLenLetters.var())*100

     def lexicalIndex(self):
+        """
+        Function that calculates different lexical indices of a text.
+        """
         self.numContentWords = reduce((lambda a, b: a + b), [len(s) for s in self.posContentSentences])
         self.numDistinctContentWords = len(set([w.text.lower() for s in self.posContentSentences for w in s]))
         if self.numContentWords == 0:
@@ -184,10 +193,16 @@ class ComplexityAnalyzer(Analyzer):

     def readability(self):
+        """
+        Function that calculates the readability of a text.
+        """
         self.autoReadabilityIndex = 4.71 * self.numChars / self.numWords + 0.5 * self.numWords/self.numContentSentences
         self.spauldingScore = 1.609*(self.numWords / self.numContentSentences) + 331.8*(self.numRareWord/self.numWords) + 22.0

     def countRareAndLowWord(self):
+        """
+        Function that counts the rare and low-frequency words of a text.
+        """
         freqWord = sorted(self.dicFreqWords, key = self.dicFreqWords.__getitem__, reverse = True)[:1500]
         countRareWord = 0
         countLowWord = 0
@@ -201,6 +216,9 @@ class ComplexityAnalyzer(Analyzer):
         self.numLowWord = countLowWord

     def sentenceComplexity(self):
+        """
+        Function that calculates the complexity at sentence level.
+        """
         numComplexSentence = 0
         for sentence in self.sentences:
             verb = False
@@ -224,6 +242,12 @@ class ComplexityAnalyzer(Analyzer):

     def countSyllabes(self, text):
+        """
+        Function that counts the syllables of a text.
+
+        Args:
+            text: a string with the text to analyze.
+        """
         t = re.sub(r'y([aáeéiíoóuú])', '\\1', text.lower())
         t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())
         t = re.sub(r'[iu][aáeyéioóu]', 'A', t).lower()
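The hunk truncates before the counting step, so here is a hedged, self-contained paraphrase of the syllable counter: the regexes neutralize consonantal "y" and collapse diphthongs into a single marker, after which counting vowel symbols approximates the syllable count. The final counting line is an assumption, since the diff cuts off here:

```python
import re

def count_syllables_es(text: str) -> int:
    # Same first three substitutions as countSyllabes above.
    t = re.sub(r'y([aáeéiíoóuú])', '\\1', text.lower())
    t = re.sub(r'[aáeéioóu][iuy]', 'A', t)
    t = re.sub(r'[iu][aáeyéioóu]', 'A', t)
    # Assumed final step: one syllable per remaining vowel or diphthong marker.
    return len(re.findall(r'[aáeéiíoóuúA]', t))

print(count_syllables_es("murciélago"))  # 4  (mur-cié-la-go)
```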
@@ -240,6 +264,9 @@ class ComplexityAnalyzer(Analyzer):
         return 1 + max(self.treeHeight(x, cont) for x in root.children)

     def embeddingDepth(self):
+        """
+        Function that calculates the embedding depth of a text.
+        """
         roots = [sent.root for sent in self.sentences]
         max_list = []
         max_list = [self.treeHeight(root, 0) for root in roots]
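The recursion above measures dependency-tree height; a standalone sketch of the same idea on a spaCy parse (assumes es_core_news_sm is installed; tree_height is a simplified stand-in for treeHeight):

```python
import spacy

def tree_height(token):
    # Height of the dependency subtree rooted at `token`.
    children = list(token.children)
    if not children:
        return 1
    return 1 + max(tree_height(child) for child in children)

nlp = spacy.load("es_core_news_sm")
doc = nlp("El perro que vimos ayer corre por el parque.")
print([tree_height(sent.root) for sent in doc.sents])  # e.g. [4]
```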
@@ -251,9 +278,12 @@ class ComplexityAnalyzer(Analyzer):
         return self.max_max_list, self.min_max_list, self.mean_max_list

     def ageReadability(self):
+        """
+        Function that calculates the age readability of a text.
+        """
         self.solReadability = -2.51 + 0.74*(3.1291+1.0430*math.sqrt(self.numWords3Syllabes*(30/self.numSentences)))
         self.minAge = 0.2495*(self.numWords/self.numSentences) + 6.4763*(self.numSyllabes/self.numWords) - 7.1395
         self.crawford = -20.5*(self.numSentences/self.numWords)+4.9*(self.numSyllabes/self.numWords)-3.407
         pass
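And a worked example of the three age-oriented scores, again with assumed counts (15 words of three or more syllables):

```python
import math

# Assumed counts for illustration.
num_words, num_sentences, num_syllables, num_words_3_syll = 100, 8, 220, 15

sol = -2.51 + 0.74 * (3.1291 + 1.0430 * math.sqrt(num_words_3_syll * (30 / num_sentences)))
min_age = 0.2495 * (num_words / num_sentences) + 6.4763 * (num_syllables / num_words) - 7.1395
crawford = -20.5 * (num_sentences / num_words) + 4.9 * (num_syllables / num_words) - 3.407

print(round(sol, 2), round(min_age, 2), round(crawford, 2))  # ≈ 5.59 10.23 5.73
```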
@@ -8,33 +8,40 @@ import torch

 class EmotionAnalyzer(Analyzer):
     def __init__(self, task = "text-classification", modelEmotions = 'pysentimiento/robertuito-emotion-analysis', allScores = True):
-        """Creates an analyzer from an input object.
+        """
+        Create an emotion analyzer.

         Args:
-            function: the function of the analyzer like count words, files...
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children (False)
+            task: the task defining which pipeline will be returned.
+            modelEmotions: the model that will be used by the pipeline to make predictions.
+            allScores: True if we want the classifier to return all scores; False otherwise.
         """
         self.emotionsClassifier = pipeline(task, model=modelEmotions, return_all_scores=allScores)
 # This analyzer can only process text strings, so it only makes sense to use the text attribute of metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""): #TODO
-        """Analyze a sequence
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
+        """
+        Analyze a sequence with an emotion function.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis result
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
-            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children (False)
-        Raises:
-            ValueError if the levelOfResult is incorrect
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
         """
         super().analyze(self.emotions, sequence, tag, levelOfAnalyzer, levelOfResult, True)
     def emotions(self, arrayText):
+        """
+        Function that analyzes the emotions of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list with dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
         arrayResults = []
         for text in arrayText:
             prediction = self.emotionsClassifier(text)
......
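A hedged usage sketch for the emotion analyzer (the "text" level name follows the docstring's path convention and is an assumption; the model downloads on first use):

```python
from textflow.Sequence import Sequence
from textflow.EmotionAnalyzer import EmotionAnalyzer

seq = Sequence("string", "Estoy muy contento con los resultados")
analyzer = EmotionAnalyzer()  # pysentimiento/robertuito-emotion-analysis
analyzer.analyze(seq, "emotions", "text")
# With return_all_scores=True each prediction is a list of
# {"label": ..., "score": ...} dicts, one per emotion.
```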
@@ -8,33 +8,39 @@ spacy.cli.download("es_core_news_sm")

 class LemmaAnalyzer(Analyzer):
     def __init__(self, nlp = spacy.load("es_core_news_sm"), posNoContent = ["PUNCT", "SPACE", "SYM"]):
-        """Creates an analyzer from an input object.
+        """Create an analyzer from an input object.

         Args:
-            function: the function of the analyzer like count words, files...
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children (False)
+            nlp: a language model.
+            posNoContent: a list with the POS tags for which we don't want to get the lemma.
         """
         self.nlp = nlp
         self.posNoContent = posNoContent
 # This analyzer can only process text strings, so it only makes sense to use the text attribute of metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""): #TODO
-        """Analyze a sequence
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
+        """
+        Analyze a sequence with a lemma function.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis result
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
-            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children (False)
-        Raises:
-            ValueError if the levelOfResult is incorrect
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
         """
         super().analyze(self.lemmas, sequence, tag, levelOfAnalyzer, levelOfResult, True)
     def lemmas(self, arrayText):
+        '''
+        Function that gets the lemmas of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list with dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        '''
         arrayResult = []
         for text in arrayText:
             sequenceLemmas = []
......
@@ -8,31 +8,35 @@ spacy.cli.download("es_core_news_sm")

 class POSAnalyzer(Analyzer):
     def __init__(self, nlp = spacy.load("es_core_news_sm")):
-        """Creates an analyzer from an input object.
+        """
+        Create a POS analyzer from an input object.

         Args:
-            function: the function of the analyzer like count words, files...
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children (False)
+            nlp: a language model.
         """
         self.nlp = nlp
 # This analyzer can only process text strings, so it only makes sense to use the text attribute of metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""): #TODO
-        """Analyze a sequence
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
+        """
+        Analyze a sequence with a POS tagger function.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis result
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
-            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children (False)
-        Raises:
-            ValueError if the levelOfResult is incorrect
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
         """
         super().analyze(self.pos, sequence, tag, levelOfAnalyzer, levelOfResult, True)
     def pos(self, arrayText):
+        '''
+        Function that gets the POS tags of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list with dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        '''
         arrayResults = []
         for text in arrayText:
             srcPOS = []
......
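Both the lemma and POS analyzers are thin wrappers over a spaCy pipeline; what they extract per token can be seen directly (illustrative output; exact tags and lemmas depend on the model version):

```python
import spacy

nlp = spacy.load("es_core_news_sm")  # assumes the model is installed
doc = nlp("Los gatos duermen")
print([(tok.text, tok.pos_, tok.lemma_) for tok in doc])
# e.g. [('Los', 'DET', 'el'), ('gatos', 'NOUN', 'gato'), ('duermen', 'VERB', 'dormir')]
```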
-import os
-import spacy
-import spacy.cli
 from typing import Optional
 from textflow.Analyzer import Analyzer
 from transformers import pipeline
@@ -8,32 +5,40 @@ import torch

 class PolarityAnalyzer(Analyzer):
     def __init__(self, task = "text-classification", modelPolarity = 'finiteautomata/beto-sentiment-analysis', allScores = True):
-        """Creates an analyzer from an input object.
+        """
+        Create a polarity analyzer.

         Args:
-            function: the function of the analyzer like count words, files...
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children (False)
+            task: the task defining which pipeline will be returned.
+            modelPolarity: the model that will be used by the pipeline to make predictions.
+            allScores: True if we want the classifier to return all scores; False otherwise.
         """
         self.polarityClassifier = pipeline(task, model=modelPolarity, return_all_scores=allScores)
 # This analyzer can only process text strings, so it only makes sense to use the text attribute of metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""): #TODO
-        """Analyze a sequence
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
+        """
+        Analyze a sequence with a polarity function.

         Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis result
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
-            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
-            analyzeMetadata: boolean, if the result of the analyzer is applied in metadata (True) or in children (False)
-        Raises:
-            ValueError if the levelOfResult is incorrect
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
         """
         super().analyze(self.polarity, sequence, tag, levelOfAnalyzer, levelOfResult, True)
     def polarity(self, arrayText):
+        """
+        Function that analyzes the polarity of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list with dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
         arrayResults = []
         for text in arrayText:
             prediction = self.polarityClassifier(text)
......
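The polarity analyzer likewise wraps a Hugging Face pipeline; calling the underlying classifier directly shows the result shape the analyzer stores (scores are illustrative):

```python
from transformers import pipeline

clf = pipeline("text-classification",
               model="finiteautomata/beto-sentiment-analysis",
               return_all_scores=True)
print(clf("Me encanta esta biblioteca"))
# e.g. [[{'label': 'NEG', 'score': 0.0...}, {'label': 'NEU', 'score': 0.0...},
#        {'label': 'POS', 'score': 0.9...}]]
```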
@@ -10,7 +10,7 @@ from nltk.tokenize import RegexpTokenizer

 class SequenceIterator:
     def __init__(self, children):
         """
-        Creates a SequenceIterator from a Sequence.
+        Create a SequenceIterator from a Sequence.

         Args:
             children: A list with the values of the attribute children of a Sequence.
         """
@@ -39,7 +39,7 @@ class SequenceIterator:
             raise StopIteration

-_VALID_FORMATS = ["directory", "string", "text", "token", None]
+_VALID_FORMATS = ["directory", "string", "file", "token", None]
 class Sequence:
     """Summary of class here.
@@ -59,6 +59,7 @@ class Sequence:
         format: A string containing the input data's type.
         src: An object representing the input data. It can be a string for a
             string format or a file path for a text format.
+
         Raises:
             ValueError: If the format is wrong.
         """
@@ -71,14 +72,14 @@ class Sequence:
         self.format = format
         self.children = {}
-        self.metadata = {"text": " "}
+        self.metadata = {}
         if format == "token":
             if not isinstance(src, str):
                 raise ValueError(f"{src} is not an instance of token")
             self.metadata["text"] = src
         if format == "string":
             self.initFromString(src, "tokens", "token", tokenizer)
-        if format == "text":
+        if format == "file":
             self.initFromDocument(src, "tokens", "token", tokenizer)
         if format == "directory":
             self.initFromDirectory(src, "directory", "files", tokenizer)
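With the rename, the constructor's three text-bearing formats look like this in use (a sketch; the "tokens" child level comes from initFromString, and the paths in the comments are hypothetical):

```python
from textflow.Sequence import Sequence

s = Sequence("string", "un texto de ejemplo")    # tokenize a string in memory
# f = Sequence("file", "ruta/al/documento.txt")  # read and tokenize a file
# d = Sequence("directory", "ruta/al/corpus")    # walk a directory tree
print(s.children["tokens"][0].metadata["text"])  # 'un'
```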
@@ -86,12 +87,12 @@ class Sequence:

     def initFromDirectory(self, directory, labelDirectory, labelFile, tokenizer):
         '''
         Initialize a Sequence from a directory
+
         Args:
             directory: the path of a directory as string
             labelDirectory: the name of the children dictionary entry for the subpaths
             labelFile: the name of the children dictionary entry for the files
         '''
-        #print(os.path.abspath((os.getcwd())))
         self.format = "directory"
         self.metadata["nameFiles"] = []
         self.metadata["directoriesPath"] = []
@@ -102,9 +103,9 @@ class Sequence:
             if os.path.isfile(directory+"/"+file):
                 self.metadata["nameFiles"].append(file)
                 if labelFile in self.children:
-                    self.children[labelFile].append(Sequence("text", directory+"/"+file))
+                    self.children[labelFile].append(Sequence("file", directory+"/"+file))
                 else:
-                    self.children[labelFile] = [Sequence("text", directory+"/"+file)]
+                    self.children[labelFile] = [Sequence("file", directory+"/"+file)]
             else:
                 self.metadata["directoriesPath"].append(directory+"/"+file)
                 if labelDirectory in self.children:
@@ -116,12 +117,13 @@ class Sequence:

     def initFromDocument(self, documentPath, labelSubSequence, formatSubsequence, tokenizer):
         '''
         Initialize a Sequence from a document
+
         Args:
             documentPath: the path of a document as string
             labelSubSequence: the name of the children dictionary entry for the subsequence as string
             formatSubSequence: the format of the subsequence in children dictionary entry as string
         '''
-        self.format = "text"
+        self.format = "file"
         with open(documentPath, "r") as f:
             txt = f.read()
             self.children[labelSubSequence] = [Sequence(formatSubsequence, token_src) for token_src in tokenizer.tokenize(txt)]
@@ -130,10 +132,12 @@ class Sequence:

     def initFromString(self, srcString, labelSubSequence, formatSubsequence, tokenizer):
         '''
         Initialize a Sequence from a string
+
         Args:
             srcString: source string of the sequence
             labelSubSequence: the name of the children dictionary entry for the subsequence as string
             formatSubSequence: the format of the subsequence in children dictionary entry as string
+
         Raises:
             ValueError: If srcString isn't a string.
         '''
@@ -171,8 +175,9 @@ class Sequence:

     def __len__(self):
         '''
-        Calculate the length of a Sequence
+        Calculate the length of a Sequence.
+        The length of a Sequence is the length of the children.

         Returns:
             A number with the length of the Sequence
         '''
@@ -190,6 +195,7 @@ class Sequence:

     def __getitem__(self, idx):
         '''
         Get the value of a key from the dictionary of children
+
         Args:
             idx: a string that represents the key of the children dictionary
                 or an integer that represents the position of the key in the children dictionary keys
@@ -307,7 +313,6 @@ class Sequence:
                     metadata = [c.metadata for c in child[r]]
                 else:
                     raise ValueError(f"Sequence level '{r}' not found in {child}")
-        #yield criteria(results)
         cont = 0
         gen = criteria(results)
         for r in gen:
......
@@ -9,21 +9,46 @@ import math
 from textflow.Analyzer import Analyzer

-class StylometryAnalyzer(Analyzer): #TODO
+class StylometryAnalyzer(Analyzer):
     def __init__(self, stopwords, puntuation = string.punctuation, tokenizer = WhitespaceTokenizer()):
+        """
+        Create a stylometry analyzer from an input object.
+
+        Args:
+            stopwords: a list with stopwords.
+            puntuation: a list with punctuation marks.
+            tokenizer: a function to tokenize the text.
+        """
         self.stopwords = stopwords
         self.puntuation = puntuation
         self.tokenizer = tokenizer
 # This analyzer can only process text strings, so it only makes sense to use the text attribute of metadata
     def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
+        """
+        Analyze a sequence with a stylometry function.
+
+        Args:
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+        """
         super().analyze(self.stylometry, sequence, tag, levelOfAnalyzer, levelOfResult, True)
     def stylometry(self, arrayText):
+        '''
+        Function that gets the stylometry (some indices, word frequencies) of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list with dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        '''
         resultsList = []
         for t in arrayText:
-            #doc = self.nlp(text)
             t.lower()
             tokens = self.tokenizer.tokenize(t)
             text = [token.lower() for token in tokens]
@@ -46,6 +71,12 @@ class StylometryAnalyzer(Analyzer): #TODO
         return resultsList

     def funcionesTTR(self, text):
+        """
+        Function that calculates different TTR indices.
+
+        Args:
+            text: a string with the text to analyze.
+        """
         self.uniqueWords = [token[0] for token in self.freqWord]
         self.numWordFreqOne = len([token[0] for token in self.freqWord if token[1] == 1])
         self.TTR = len(self.uniqueWords) / len(text)
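A worked example of the type-token ratio computed above (TTR = distinct types / total tokens):

```python
tokens = "el perro y el gato y el loro".split()  # 8 tokens
unique_words = set(tokens)                       # 5 types
ttr = len(unique_words) / len(tokens)
print(ttr)  # 0.625
```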
@@ -64,6 +95,14 @@ class StylometryAnalyzer(Analyzer): #TODO

     def freqWords(self, tokens, stopWords, puntuationMarks):
+        """
+        Function that counts the frequency of stopwords, punctuation marks and words in a list of tokens.
+
+        Args:
+            tokens: a list of tokens whose frequencies we want to count.
+            stopWords: a list with the stopwords.
+            puntuationMarks: a list with the punctuation marks.
+        """
         freqStopWords = {}
         freqPuntuationMarks = {}
         freqWord = {}
......
@@ -5,16 +5,25 @@ from textflow.Analyzer import Analyzer

 class VolumetryAnalyzer(Analyzer):
     def __init__(self, tokenizer = WhitespaceTokenizer()):
-        """Creates an analyzer from an input object.
-        Args:
-            function: the function of the analyzer like count words, files...
-            isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children (False)
-        """
+        """
+        Create a volumetry analyzer from an input object.
+
+        Args:
+            tokenizer: the way to split a text into tokens.
+        """
         self.tokenizer = tokenizer
     def volumetry(self, arrayText):
+        """
+        Function that analyzes the volumetry of a list of texts.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze.
+
+        Returns:
+            A list with dictionaries. Each dictionary contains the result
+            of the analysis of the corresponding text.
+        """
         arrayResults = []
         for texts in arrayText:
             text = self.tokenizer.tokenize(texts)
@@ -27,33 +36,18 @@ class VolumetryAnalyzer(Analyzer):
             arrayResults.append(dicResults)
         return arrayResults
-    # The sequence must always have a text attribute (metadata) for this to work
-    # Count the number of words, number of unique words, number of characters and average number of characters
     def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
-        super().analyze(self.volumetry, sequence, tag, levelOfAnalyzer, levelOfResult, True)
-        '''children = [sequence.children]
-        ruta = levelOfAnalyze.split("/")
-        for r in ruta:  # For each level of the path
-            for child in children:  # Look at every available sequence
-                if r in child:  # If r is inside the current sequence
-                    if r == ruta[-1]:
-                        for seq in child[r]:
-                            if "text" not in seq.metadata:
-                                raise ValueError(f"Level text not found in {seq.metadata.keys()}")
-                            else:
-                                text = seq.metadata["text"].split(" ")
-                                volumetry = {
-                                    "words": len(text),
-                                    "uniqueWords": len(set(text)),
-                                    "chars": len(seq.metadata["text"]),
-                                    "avgWordsLen": round(volumetry["chars"] / volumetry["words"])
-                                }
-                                seq.metadata["volumetry"] = volumetry
-                    else:
-                        children = [c.children for c in child[r]]
-                else:
-                    raise ValueError(f"Sequence level '{r}' not found in {child}")'''
+        """
+        Analyze a sequence with a volumetry function.
+
+        Args:
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+        """
+        super().analyze(self.volumetry, sequence, tag, levelOfAnalyzer, levelOfResult, True)
 \ No newline at end of file
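Finally, a hedged usage sketch of the volumetry analyzer after this cleanup (the level name "text" is an assumption based on the docstrings):

```python
from nltk.tokenize import WhitespaceTokenizer
from textflow.Sequence import Sequence
from textflow.VolumetryAnalyzer import VolumetryAnalyzer

seq = Sequence("string", "uno dos dos tres")
va = VolumetryAnalyzer(WhitespaceTokenizer())
va.analyze(seq, "volumetry", "text")
# Expected result dictionaries carry words, uniqueWords, chars and avgWordsLen.
```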