Refactoring some classes

parent a359b5fa
Veo que en este foro, afortunadamente para vosotros, no hay mucha gente que sufra de TOC. Si hay alguien por ahí, me gustaría que compartierais vuestras opiniones; yo compruebo las cosas que hago porque tengo miedo de haberme equivocado y pienso en las consecuencias que ese error podría acarrearme, y las compruebo una y otra vez, y esto me angustia.
Veo que en este foro, afortunadamente para vosotros, no hay mucha gente que sufra de TOC. Si hay alguien por ahí, me gustaría que compartierais vuestras opiniones; yo compruebo las cosas que hago porque tengo miedo de haberme equivocado y pienso en las consecuencias que ese error podría acarrearme, y las compruebo una y otra vez, y esto me angustia.
Sé que abrí un post parecido hace tiempo, pero ya quedó abajo y por tanto en el olvido, por eso abro este por si alguna persona nueva con este problema lo lee. Me gustaría saber qué os recetan a vosotros para esto y si os va bien.
Saludos.
Nereida.
\ No newline at end of file
......@@ -7,6 +7,9 @@ from abc import ABC, abstractmethod
class Analyzer(ABC):
"""
Abstract class that provides methods to analyze sequences
"""
@abstractmethod
def analyze(self, functionAnalyzer,sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = "", analyzeMetadata: Optional[bool] = False): #TODO
......
......@@ -14,6 +14,40 @@ creaPath = os.path.join(os.path.dirname(__file__), 'Crea-5000.txt')
spacy.cli.download("es_core_news_sm")
class ComplexityAnalyzer(Analyzer):
"""
A class that provides methods to analyze the complexity of the text of a sequence.
Attributes:
nlp: a model of language.
dicFreqWords: a dictionary with the most frequence words of spanish language.
numContentSentences: the number of the content sentences in the text.
numComplexSents: the nomber of complex sentences in the text.
avgLenSentence: the average length of the sentences in the text.
numPuntuationMark: the number of punctuation marks in the text.
numWords: the number of words in the text.
numRareWord: the number of rare words in the text.
numSyllabes: the number of syllabes in the text.
numChars: the number of chars in the text.
indexLowFreqWords: the index of low frequence words of a text.
lexicalDistributionIndex: the index of lexical distribution of a text.
lexicalComplexity: the index of lexical complexity of a text.
spauldingScore: the Spaulding's readability score.
sentenceComplexityIndex: the index of sentence complexity.
autoReadabilityIndex: the autoreadability index of a text.
readabilityFH: the Fernandez Huerta readability of a text.
perspicuityIFSZ: the Flesch-Szigriszt perspicuity of a text.
poliniComprensibility: the Polini comprehensibility of a text.
muLegibility: the Mu legibility of a text.
minAge: the minimum age to read a text.
solReadability: the SOL readability of a text.
crawford: the Crawford's years of a text.
min_max_list: the minimum of maximum tree depths.
max_max_list: the maximum of maximum tree depths.
mean_max_list: the mean of maximum tree depths.
"""
def __init__(self, rutaArchivoCrea = creaPath, nlp = spacy.load("es_core_news_sm")):
"""
Create a complexity analyzer from an input object.
......
......@@ -7,6 +7,13 @@ from transformers import pipeline
import torch
class EmotionAnalyzer(Analyzer):
"""
A class that provides methods to analyze the emotions of the text of a sequence.
Attributes:
polarityClassifier: a pipeline that uses a model for inference the emotions of the text of a sequence.
"""
def __init__(self, task = "text-classification",modelEmotions = 'pysentimiento/robertuito-emotion-analysis', allScores = True):
"""
Create a emotions analyzer.
......
......@@ -7,6 +7,15 @@ from textflow.Analyzer import Analyzer
spacy.cli.download("es_core_news_sm")
class LemmaAnalyzer(Analyzer):
"""
A class that provides methods to analyze the lemmas of the text of a sequence.
Attributes:
nlp: a model of language.
posNoContent: a list with the POS tag from which we don't want to get the lemma.
Some of the POS tag that haven't content are "PUNCT", "SPACE", "SYM".
"""
def __init__(self, nlp = spacy.load("es_core_news_sm"), posNoContent = ["PUNCT", "SPACE", "SYM"]):
"""Create an analyzer from an input object.
......
......@@ -7,6 +7,13 @@ from textflow.Analyzer import Analyzer
spacy.cli.download("es_core_news_sm")
class POSAnalyzer(Analyzer):
"""
A class that provides methods to analyze the part-of-speech(POS) of the text of a sequence.
Attributes:
nlp: a model of language.
"""
def __init__(self, nlp = spacy.load("es_core_news_sm")):
"""
Create a POS analyzer from an input object.
......
......@@ -4,6 +4,13 @@ from transformers import pipeline
import torch
class PolarityAnalyzer(Analyzer):
"""
A class that provides methods to analyze the polarity of the text of a sequence.
Attributes:
polarityClassifier: a pipeline that uses a model for inference the polarity of the text of a sequence.
"""
def __init__(self, task = "text-classification",modelPolarity = 'finiteautomata/beto-sentiment-analysis', allScores = True):
"""
Create a polarity analyzer.
......
import os
from typing import Optional
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import SpaceTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import RegexpTokenizer
from abc import ABC, abstractmethod
class SequenceIterator:
"""
A class that provides methods to iterate over the children of a sequence
Attributes:
idx: an integer with the position of the iterator.
children: a dictionary with the subsequence of a sequence.
"""
def __init__(self, children):
"""
Create a sequenceIterator from a Sequence.
......@@ -39,115 +41,30 @@ class SequenceIterator:
raise StopIteration
_VALID_FORMATS = ["directory","string", "file", "token", None]
class Sequence:
"""Summary of class here.
Longer class information...
Longer class information...
class Sequence(ABC):
"""
Abstract class that provides methods to create a sequence from different formats
Attributes:
id: ...
text: ...
sequences: ...
format: a string with the origin format of a sequence.
metadata: a dictionary with the metadata of a sequence.
children: a dictionary with the subsequence of a sequence.
"""
def __init__(self, format: Optional[str] = None, src: Optional[object] = None, tokenizer: Optional[object] = None):
    """
    Create a sequence from an input object.

    Args:
        format: a string with the input data's type; one of _VALID_FORMATS
            ("directory", "string", "file", "token" or None).
        src: the input data: a string for the "string"/"token" formats, or
            a path for the "file"/"directory" formats.
        tokenizer: an object with a tokenize(str) method used to split text
            into subsequences; defaults to a WhitespaceTokenizer.

    Raises:
        ValueError: if format is not a valid format, or if src is not a
            string when format is "token".
    """
    if format not in _VALID_FORMATS:
        raise ValueError(
            f"{format} is not a valid format. Valid formats: {_VALID_FORMATS}"
        )
    if tokenizer is None:  # 'is None', not '== None' (PEP 8)
        tokenizer = WhitespaceTokenizer()
    self.format = format
    self.children = {}
    self.metadata = {}
    # The formats are mutually exclusive, so dispatch with an elif chain
    # instead of re-testing every branch.
    if format == "token":
        if not isinstance(src, str):
            raise ValueError(f"{src} is not an instance of token")
        self.metadata["text"] = src
    elif format == "string":
        self.initFromString(src, "tokens", "token", tokenizer)
    elif format == "file":
        self.initFromDocument(src, "tokens", "token", tokenizer)
    elif format == "directory":
        self.initFromDirectory(src, "directory", "files", tokenizer)
def initFromDirectory(self, directory, labelDirectory, labelFile, tokenizer):
    '''
    Initialize this Sequence from a directory path.

    Args:
        directory: the path of the directory as a string
        labelDirectory: the children-dictionary key under which
            subdirectory sequences are stored
        labelFile: the children-dictionary key under which file sequences
            are stored
        tokenizer: the tokenizer handed down to subdirectory sequences
    '''
    self.format = "directory"
    self.metadata["nameFiles"] = []
    self.metadata["directoriesPath"] = []
    for entry in os.listdir(directory):
        path = directory + "/" + entry
        if os.path.isfile(path):
            self.metadata["nameFiles"].append(entry)
            self.children.setdefault(labelFile, []).append(
                Sequence("file", path)
            )
        else:
            self.metadata["directoriesPath"].append(path)
            self.children.setdefault(labelDirectory, []).append(
                Sequence("directory", path, tokenizer)
            )
def initFromDocument(self, documentPath, labelSubSequence, formatSubsequence, tokenizer):
@abstractmethod
def inicializeSequence(self,format):
'''
Initialize a Sequence from a document
Initializes the attributes of a sequence.
Args:
documentPath: the path of a document as string
labelSubSequence: the name of the children dictionary entry for the subsequence as string
formatSubSequence: the format of the subsequence in children dictionary entry as string
'''
self.format = "file"
with open(documentPath, "r") as f:
txt = f.read()
self.children[labelSubSequence] = [Sequence(formatSubsequence,token_src) for token_src in tokenizer.tokenize(txt)]
self.metadata["text"] = txt
def initFromString(self, srcString, labelSubSequence, formatSubsequence, tokenizer):
format: a string with the origin format of the sequence.
'''
Initialize a Sequence from a string
Args:
srcString: source string of the sequence
labelSubSequence: the name of the children dictionary entry for the subsequence as string
formatSubSequence: the format of the subsequence in children dictionary entry as string
Raises:
ValueError: If srcString isn't a string .
'''
if not isinstance(srcString, str):
raise ValueError(f"{srcString} is not an instance of string")
self.format = "string"
self.children[labelSubSequence]= [Sequence(formatSubsequence,token_src) for token_src in tokenizer.tokenize(srcString)]
self.metadata["text"]= srcString
self.format = format
self.metadata={}
self.children={}
return self.format, self.metadata, self.children
@abstractmethod
def __str__(self):
'''
Convert a Sequence to a string
......@@ -156,7 +73,8 @@ class Sequence:
A string that contains the text of a Sequence
'''
return str(self.metadata["text"])
@abstractmethod
def __repr__(self):
'''
Convert a Sequence to a string
......@@ -173,6 +91,7 @@ class Sequence:
")"
)
@abstractmethod
def __len__(self):
'''
Calculate the length of a Sequence.
......@@ -183,6 +102,7 @@ class Sequence:
'''
return len(self.children)
@abstractmethod
def __iter__(self):
'''
Iterate in a Sequence
......@@ -192,6 +112,7 @@ class Sequence:
'''
return SequenceIterator(list(self.children.values()))
@abstractmethod
def __getitem__(self, idx):
'''
Get the value of a key from the dictionary of children
......@@ -218,13 +139,23 @@ class Sequence:
else: # TODO: Should it support slices (e.g. [2:4])?
raise ValueError(f"Sequence id '{idx}' not found in {self.children}")
@abstractmethod
def __eq__(self, other):
    '''
    Compare this sequence with another one for equality.

    Args:
        other: the sequence to compare against.

    Returns:
        True when both sequences share the same format, metadata and
        children; False otherwise.
    '''
    return (
        self.format == other.format
        and self.metadata == other.metadata
        and self.children == other.children
    )
@abstractmethod
def depth(self,diccionaryList: Optional[list] = None):
'''
Calculate the maximum depth of a Sequence
......@@ -253,7 +184,7 @@ class Sequence:
rutaMax = ruta
return (profMax, rutaMax)
@abstractmethod
def filter(self, level, criteria):
'''
Filter the children of a Sequence according to a criteria
......@@ -282,8 +213,9 @@ class Sequence:
for r in gen:
yield gen[cont]
cont+=1
def filterMetadata(self, level, criteria): #TODO
@abstractmethod
def filterMetadata(self, level, criteria):
'''
Filter the children of a Sequence according to a criteria
......@@ -298,21 +230,18 @@ class Sequence:
children = [self.children]
metadata = [self.metadata]
results=[]
if len(ruta) == 1 and ruta[0] in metadata[0]:
results.append(metadata[0][ruta[0]])
else:
for r in ruta:
if r == ruta[-1]:
for m in metadata:
if r in m:
results.append(m[r])
else:
for child in children:
if r in child:
children = [c.children for c in child[r]]
metadata = [c.metadata for c in child[r]]
else:
raise ValueError(f"Sequence level '{r}' not found in {child}")
for r in ruta:
if r == ruta[-1]:
for m in metadata:
if r in m:
results.append(m[r])
else:
for child in children:
if r in child:
children = [c.children for c in child[r]]
metadata = [c.metadata for c in child[r]]
else:
raise ValueError(f"Sequence level '{r}' not found in {child}")
cont=0
gen = criteria(results)
for r in gen:
......
import os
from typing import Optional
from nltk.tokenize import WhitespaceTokenizer
from textflow.Sequence import Sequence
from textflow.SequenceFile import SequenceFile
from textflow.SequenceString import SequenceString
class SequenceDirectory(Sequence):
    """
    A class that provides methods to create a sequence from a directory.

    Attributes:
        format: a string with the origin format of a sequence.
        metadata: a dictionary with the metadata of a sequence.
        children: a dictionary with the subsequences of a sequence.
    """

    def __init__(self, src, listLabel=["directories", "files", "tokens"], listClasses=[SequenceFile, SequenceString], listTokenizer=[WhitespaceTokenizer()]):
        '''
        Initialize a Sequence from a directory path.

        By default, subsequences are created for every directory and file
        found under the source directory and, for each file, further
        subsequences that split its text into words.

        NOTE: the default argument lists are shared between calls (mutable
        default arguments). They are safe here only because they are read,
        never mutated, inside this method.

        Args:
            src: the path of the source directory as a string.
            listLabel: the children-dictionary keys for each nesting level
                (subdirectories, files, tokens).
            listClasses: the Sequence subclasses used to build each level
                below this one.
            listTokenizer: the tokenizers used at each level below this one.
        '''
        self.inicializeSequence("directory")
        self.metadata["nameFiles"] = []
        self.metadata["directoriesPath"] = []
        if not listTokenizer:  # one truthiness test covers both None and []
            listTokenizer = [WhitespaceTokenizer()]
        for entry in os.listdir(src):
            path = src + "/" + entry
            if os.path.isfile(path):
                self.metadata["nameFiles"].append(entry)
                if listLabel and listClasses:
                    # Files are built with the next class/label/tokenizer in
                    # the chain; the sliced tails configure deeper levels.
                    if listLabel[1] in self.children:
                        self.children[listLabel[1]].append(listClasses[0](path, listLabel[1:], listClasses[1:], listTokenizer[1:]))
                    else:
                        self.children[listLabel[1]] = [listClasses[0](path, listLabel[1:], listClasses[1:], listTokenizer[1:])]
            else:
                self.metadata["directoriesPath"].append(path)
                # Subdirectories recurse with the same configuration lists.
                if listLabel[0] in self.children:
                    self.children[listLabel[0]].append(SequenceDirectory(path, listLabel, listClasses, listTokenizer))
                else:
                    self.children[listLabel[0]] = [SequenceDirectory(path, listLabel, listClasses, listTokenizer)]

    # The methods below only delegate to Sequence. They must stay: the base
    # class declares them @abstractmethod, so removing them would make this
    # class abstract and non-instantiable.

    def inicializeSequence(self, format):
        '''
        Initialize the attributes of a sequence.

        Args:
            format: a string with the origin format of the sequence.
        '''
        super().inicializeSequence(format)

    def __str__(self):
        '''
        Convert a Sequence to a string.

        Returns:
            A string that contains the text of a Sequence.
        '''
        return super().__str__()

    def __repr__(self):
        '''
        Convert a Sequence to a string.

        Returns:
            A string with the formal representation of a Sequence.
        '''
        return super().__repr__()

    def __len__(self):
        '''
        Calculate the length of a Sequence.

        The length of a Sequence is the length of its children.

        Returns:
            A number with the length of the Sequence.
        '''
        return super().__len__()

    def __iter__(self):
        '''
        Iterate over a Sequence.

        To do this, we iterate through the children dictionary.

        Returns:
            A SequenceIterator.
        '''
        return super().__iter__()

    def __getitem__(self, idx):
        '''
        Get the value of a key from the dictionary of children.

        Args:
            idx: a string that represents the key of the children dictionary,
                or an integer that represents the position of the key in the
                children dictionary keys.

        Returns:
            A list of Sequences.
        '''
        return super().__getitem__(idx)

    def __eq__(self, other):
        '''
        Check whether a sequence is equal to the current one.

        Args:
            other: the sequence to compare against.

        Returns:
            True if the sequences are equal, False otherwise.
        '''
        return super().__eq__(other)

    def depth(self, dictionaryList: Optional[list] = None):
        '''
        Calculate the maximum depth of a Sequence.

        Args:
            dictionaryList: the initial list used to calculate the depth.

        Returns:
            A tuple that contains a number (the depth of the Sequence) and a
            list (the route of the maximum depth).
        '''
        return super().depth(dictionaryList)

    def filter(self, level, criteria):
        '''
        Filter the children of a Sequence according to a criteria.

        Args:
            level: the route of the level as a string, separating each level
                with "/".
            criteria: the filter function.

        Returns:
            A generator with the result of the filter.
        '''
        return super().filter(level, criteria)

    def filterMetadata(self, level, criteria):
        '''
        Filter the metadata of a Sequence according to a criteria.

        Args:
            level: the route of the level as a string, separating each level
                with "/".
            criteria: the filter function.

        Returns:
            A generator with the result of the filter.
        '''
        return super().filterMetadata(level, criteria)
\ No newline at end of file
import os
from typing import Optional
from nltk.tokenize import WhitespaceTokenizer
from textflow.Sequence import Sequence
from textflow.SequenceString import SequenceString
class SequenceFile(Sequence):
    """
    A class that provides methods to create a sequence from a file.

    Attributes:
        format: a string with the origin format of a sequence.
        metadata: a dictionary with the metadata of a sequence.
        children: a dictionary with the subsequences of a sequence.
    """

    def __init__(self, src, listLabel=["tokens"], listClasses=[SequenceString], listTokenizer=[WhitespaceTokenizer()]):
        '''
        Initialize a Sequence from a file path.

        By default, creates subsequences by splitting the text of the file
        into words.

        NOTE: the default argument lists are shared between calls (mutable
        default arguments). They are safe here only because they are read,
        never mutated, inside this method.

        Args:
            src: the path of the source file as a string.
            listLabel: the children-dictionary keys for each nesting level.
            listClasses: the Sequence subclasses used to build each level
                below this one.
            listTokenizer: the tokenizers used at each level, starting with
                the one that splits this file's text.
        '''
        self.inicializeSequence("file")
        # NOTE(review): the file is read with the platform default encoding —
        # confirm whether an explicit encoding (e.g. UTF-8) should be used.
        with open(src, "r") as f:
            txt = f.read()
        self.metadata["text"] = txt
        self.metadata["nameFile"] = src.split("/")[-1]
        if listLabel and listClasses:
            if not listTokenizer:  # one truthiness test covers None and []
                listTokenizer = [WhitespaceTokenizer()]
            self.children[listLabel[0]] = [listClasses[0](token_src, listLabel[1:], listClasses[1:], listTokenizer[1:]) for token_src in listTokenizer[0].tokenize(txt)]

    # The methods below only delegate to Sequence. They must stay: the base
    # class declares them @abstractmethod, so removing them would make this
    # class abstract and non-instantiable.

    def inicializeSequence(self, format):
        '''
        Initialize the attributes of a sequence.

        Args:
            format: a string with the origin format of the sequence.
        '''
        super().inicializeSequence(format)

    def __str__(self):
        '''
        Convert a Sequence to a string.

        Returns:
            A string that contains the text of a Sequence.
        '''
        return super().__str__()

    def __repr__(self):
        '''
        Convert a Sequence to a string.

        Returns:
            A string with the formal representation of a Sequence.
        '''
        return super().__repr__()

    def __len__(self):
        '''
        Calculate the length of a Sequence.

        The length of a Sequence is the length of its children.

        Returns:
            A number with the length of the Sequence.
        '''
        return super().__len__()

    def __iter__(self):
        '''
        Iterate over a Sequence.

        To do this, we iterate through the children dictionary.

        Returns:
            A SequenceIterator.
        '''
        return super().__iter__()

    def __getitem__(self, idx):
        '''
        Get the value of a key from the dictionary of children.

        Args:
            idx: a string that represents the key of the children dictionary,
                or an integer that represents the position of the key in the
                children dictionary keys.

        Returns:
            A list of Sequences.
        '''
        return super().__getitem__(idx)

    def __eq__(self, other):
        '''
        Check whether a sequence is equal to the current one.

        Args:
            other: the sequence to compare against.

        Returns:
            True if the sequences are equal, False otherwise.
        '''
        return super().__eq__(other)

    def depth(self, dictionaryList: Optional[list] = None):
        '''
        Calculate the maximum depth of a Sequence.

        Args:
            dictionaryList: the initial list used to calculate the depth.

        Returns:
            A tuple that contains a number (the depth of the Sequence) and a
            list (the route of the maximum depth).
        '''
        return super().depth(dictionaryList)

    def filter(self, level, criteria):
        '''
        Filter the children of a Sequence according to a criteria.

        Args:
            level: the route of the level as a string, separating each level
                with "/".
            criteria: the filter function.

        Returns:
            A generator with the result of the filter.
        '''
        return super().filter(level, criteria)

    def filterMetadata(self, level, criteria):
        '''
        Filter the metadata of a Sequence according to a criteria.

        Args:
            level: the route of the level as a string, separating each level
                with "/".
            criteria: the filter function.

        Returns:
            A generator with the result of the filter.
        '''
        return super().filterMetadata(level, criteria)
\ No newline at end of file
import os
from typing import Optional
from nltk.tokenize import WhitespaceTokenizer
from textflow.Sequence import Sequence
from textflow.SequenceToken import SequenceToken
class SequenceString(Sequence):
    """
    A class that provides methods to create a sequence from a string.

    Attributes:
        format: a string with the origin format of a sequence.
        metadata: a dictionary with the metadata of a sequence.
        children: a dictionary with the subsequences of a sequence.
    """

    def __init__(self, src, listLabel=["token"], listClasses=[SequenceToken], listTokenizer=[WhitespaceTokenizer()]):
        '''
        Initialize a Sequence from a string.

        By default, creates subsequences by splitting the string into words.

        NOTE: the default argument lists are shared between calls (mutable
        default arguments). They are safe here only because they are read,
        never mutated, inside this method.

        Args:
            src: the source string of the sequence.
            listLabel: the children-dictionary keys for each nesting level.
            listClasses: the Sequence subclasses used to build each level
                below this one.
            listTokenizer: the tokenizers used at each level, starting with
                the one that splits this string.

        Raises:
            ValueError: if src is not a string.
        '''
        self.inicializeSequence("string")
        if not isinstance(src, str):
            raise ValueError(f"{src} is not an instance of string")
        self.metadata["text"] = src
        if listLabel and listClasses:
            if not listTokenizer:  # one truthiness test covers None and []
                listTokenizer = [WhitespaceTokenizer()]
            self.children[listLabel[0]] = [listClasses[0](token_src, listLabel[1:], listClasses[1:], listTokenizer[1:]) for token_src in listTokenizer[0].tokenize(src)]

    # The methods below only delegate to Sequence. They must stay: the base
    # class declares them @abstractmethod, so removing them would make this
    # class abstract and non-instantiable.

    def inicializeSequence(self, format):
        '''
        Initialize the attributes of a sequence.

        Args:
            format: a string with the origin format of the sequence.
        '''
        super().inicializeSequence(format)

    def __str__(self):
        '''
        Convert a Sequence to a string.

        Returns:
            A string that contains the text of a Sequence.
        '''
        return super().__str__()

    def __repr__(self):
        '''
        Convert a Sequence to a string.

        Returns:
            A string with the formal representation of a Sequence.
        '''
        return super().__repr__()

    def __len__(self):
        '''
        Calculate the length of a Sequence.

        The length of a Sequence is the length of its children.

        Returns:
            A number with the length of the Sequence.
        '''
        return super().__len__()

    def __iter__(self):
        '''
        Iterate over a Sequence.

        To do this, we iterate through the children dictionary.

        Returns:
            A SequenceIterator.
        '''
        return super().__iter__()

    def __getitem__(self, idx):
        '''
        Get the value of a key from the dictionary of children.

        Args:
            idx: a string that represents the key of the children dictionary,
                or an integer that represents the position of the key in the
                children dictionary keys.

        Returns:
            A list of Sequences.
        '''
        return super().__getitem__(idx)

    def __eq__(self, other):
        '''
        Check whether a sequence is equal to the current one.

        Args:
            other: the sequence to compare against.

        Returns:
            True if the sequences are equal, False otherwise.
        '''
        return super().__eq__(other)

    def depth(self, dictionaryList: Optional[list] = None):
        '''
        Calculate the maximum depth of a Sequence.

        Args:
            dictionaryList: the initial list used to calculate the depth.

        Returns:
            A tuple that contains a number (the depth of the Sequence) and a
            list (the route of the maximum depth).
        '''
        return super().depth(dictionaryList)

    def filter(self, level, criteria):
        '''
        Filter the children of a Sequence according to a criteria.

        Args:
            level: the route of the level as a string, separating each level
                with "/".
            criteria: the filter function.

        Returns:
            A generator with the result of the filter.
        '''
        return super().filter(level, criteria)

    def filterMetadata(self, level, criteria):
        '''
        Filter the metadata of a Sequence according to a criteria.

        Args:
            level: the route of the level as a string, separating each level
                with "/".
            criteria: the filter function.

        Returns:
            A generator with the result of the filter.
        '''
        return super().filterMetadata(level, criteria)
\ No newline at end of file
import os
from typing import Optional
from nltk.tokenize import WhitespaceTokenizer
from textflow.Sequence import Sequence
class SequenceToken(Sequence):
    """
    A class that provides methods to create a sequence from a token.

    Attributes:
        format: a string with the origin format of a sequence.
        metadata: a dictionary with the metadata of a sequence.
        children: a dictionary with the subsequences of a sequence.
    """

    def __init__(self, src, listLabel=None, listClasses=None, listTokenizer=None):
        '''
        Initialize a Sequence from a token.

        Tokens are leaves of the sequence tree, so the listLabel,
        listClasses and listTokenizer arguments are accepted only to keep a
        constructor signature uniform with the other Sequence subclasses;
        they are ignored. They now default to None so a token can also be
        built standalone (existing positional calls keep working).

        Args:
            src: the token text as a string.
            listLabel: unused; kept for interface compatibility.
            listClasses: unused; kept for interface compatibility.
            listTokenizer: unused; kept for interface compatibility.
        '''
        self.inicializeSequence("token")
        self.metadata["text"] = src

    # The methods below only delegate to Sequence. They must stay: the base
    # class declares them @abstractmethod, so removing them would make this
    # class abstract and non-instantiable.

    def inicializeSequence(self, format):
        '''
        Initialize the attributes of a sequence.

        Args:
            format: a string with the origin format of the sequence.
        '''
        super().inicializeSequence(format)

    def __str__(self):
        '''
        Convert a Sequence to a string.

        Returns:
            A string that contains the text of a Sequence.
        '''
        return super().__str__()

    def __repr__(self):
        '''
        Convert a Sequence to a string.

        Returns:
            A string with the formal representation of a Sequence.
        '''
        return super().__repr__()

    def __len__(self):
        '''
        Calculate the length of a Sequence.

        The length of a Sequence is the length of its children.

        Returns:
            A number with the length of the Sequence.
        '''
        return super().__len__()

    def __iter__(self):
        '''
        Iterate over a Sequence.

        To do this, we iterate through the children dictionary.

        Returns:
            A SequenceIterator.
        '''
        return super().__iter__()

    def __getitem__(self, idx):
        '''
        Get the value of a key from the dictionary of children.

        Args:
            idx: a string that represents the key of the children dictionary,
                or an integer that represents the position of the key in the
                children dictionary keys.

        Returns:
            A list of Sequences.
        '''
        return super().__getitem__(idx)

    def __eq__(self, other):
        '''
        Check whether a sequence is equal to the current one.

        Args:
            other: the sequence to compare against.

        Returns:
            True if the sequences are equal, False otherwise.
        '''
        return super().__eq__(other)

    def depth(self, dictionaryList: Optional[list] = None):
        '''
        Calculate the maximum depth of a Sequence.

        Args:
            dictionaryList: the initial list used to calculate the depth.

        Returns:
            A tuple that contains a number (the depth of the Sequence) and a
            list (the route of the maximum depth).
        '''
        return super().depth(dictionaryList)

    def filter(self, level, criteria):
        '''
        Filter the children of a Sequence according to a criteria.

        Args:
            level: the route of the level as a string, separating each level
                with "/".
            criteria: the filter function.

        Returns:
            A generator with the result of the filter.
        '''
        return super().filter(level, criteria)

    def filterMetadata(self, level, criteria):
        '''
        Filter the metadata of a Sequence according to a criteria.

        Args:
            level: the route of the level as a string, separating each level
                with "/".
            criteria: the filter function.

        Returns:
            A generator with the result of the filter.
        '''
        return super().filterMetadata(level, criteria)
......@@ -10,7 +10,26 @@ import math
from textflow.Analyzer import Analyzer
class StylometryAnalyzer(Analyzer):
"""
A class that provides methods to analyze the stylometry of the text of a sequence.
Attributes:
stopwords: a list with stopwords.
puntuation: a list with puntuationMarks.
tokenizer: a function to tokenize the text.
uniqueWords: a list with the vocabulary of a text.
numWordFreqOne: the numbers of words that only appear one time in the text.
TTR: type-token ratio.
RTTR: root type-token ratio.
herdan: the index of Herdan.
mass: the index of Mass.
somers: the index of Somers.
dugast: the index of Dugast.
honore: the index of Honoré.
freqStopWords: the frequence of the stopwords in the text.
freqPuntuationMarks: the frequence of the different puntuations marks in the text.
freqWord: the frequence of the different words in the text.
"""
def __init__(self,stopwords, puntuation = string.punctuation,tokenizer = WhitespaceTokenizer()):
"""
Create a stylometry analyzer from an input object.
......
......@@ -4,6 +4,12 @@ from nltk.tokenize import WhitespaceTokenizer
from textflow.Analyzer import Analyzer
class VolumetryAnalyzer(Analyzer):
"""
A class that provides methods to analyze the volumetry of the text of a sequence.
Attributes:
tokenizer: the way to split the text of a sequence in tokens.
"""
def __init__(self, tokenizer= WhitespaceTokenizer()):
"""
Create a volumetry analyzer from an input object.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment