# Text-complexity metrics for Spanish and English text, computed with spaCy.
import math
import re
from functools import reduce

import numpy as np
import syllables

# Required by the code below (language models / pipelines).
import spacy
import es_core_news_sm
class TextComplexitySpacy():
    """Compute text-complexity metrics for Spanish or English text using spaCy."""

    def __init__(self, lang='es'):
        """Create the analyzer.

        Parameters
        ----------
        lang : str
            'es' (default, Spanish) or 'en' (English); selects the spaCy model.
        """
        self.lang = lang
        # Create the language analyzer for the requested language.
        if lang == 'es':
            self.nlp = es_core_news_sm.load()
        if lang == 'en':
            self.nlp = spacy.load("en_core_web_sm")
        # Load the CREA frequency list (word -> absolute frequency).
        # 'with' guarantees the handle is closed even if parsing raises.
        # NOTE(review): crea_total_path must be defined elsewhere in the project.
        with open(crea_total_path) as f:
            lines = f.readlines()
        crea = {}
        # Words outside the most frequent CREA entries are treated as
        # low-frequency words later on.
        # NOTE(review): lines[1:1000] keeps 999 data rows (header skipped) —
        # confirm whether exactly 1000 words was intended.
        for l in lines[1:1000]:
            data = l.strip().split()
            crea[data[1]] = float(data[2].replace(',', ''))
        self.crea = crea
def textProcessing(self, text):
# Meter todas las funciones en una patron de los tokens válidos
doc = self.nlp(text)
self.tokens = [w for w in doc]
self.sentences = [sent for sent in doc.sents]
self.nsentences = len(self.sentences)
'''
Filtra aquellos tokens que no sean adjetivos, verbos o sustantivos
'''
pos_content_sentences = []
for sentence in self.sentences:
ws = self.nlp(sentence.text)
pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])
self.pos_content_sentences = pos_content_sentences
return self.pos_content_sentences
def punctuationMarks(self):
# Solo nos interesa contar los tokens que sean signo de puntuación.
# Number of words.
punctuation = []
N_words = []
for w in self.tokens:
if re.match('PUNCT.*', w.pos_):
punctuation.append(w.text)
else:
N_words.append(w.text)
aux = len(N_words)
if aux == 0:
aux = 1
self.N_words = aux
self.npunctuation = len(punctuation)
self.punctuation = punctuation
return self.npunctuation, self.punctuation, self.N_words
def lexicalComplexity(self):
# Number of low frequency words
count = 0
for sentence in self.pos_content_sentences:
for w in sentence:
if w.text not in self.crea:
count+=1
N_lfw = count
self.N_lfw = N_lfw
# Number of distinct content words
N_dcw = len(set([w.text.lower() for s in self.pos_content_sentences for w in s]))
self.N_dcw =N_dcw
# Number of sentences
self.N_s = len(self.pos_content_sentences)
# Number of total content words
N_cw = reduce((lambda x, y: x + y), [len(s) for s in self.pos_content_sentences])
self.N_cw = N_cw
# Lexical Distribution Index
if self.N_s == 0:
self.N_s = 1
LDI = N_dcw / float(self.N_s)
self.LDI = LDI
# Index of Low Frequency Words
if N_cw == 0:
N_cw = 1
ILFW = N_lfw / float(N_cw)
self.ILFW =ILFW
# Lexical Complexity
LC = (LDI + ILFW) / 2
self.LC = LC
return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, self.LC
def ssReadability(self):
#Number of rare words
byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True)
byfreq = byfreq[:1500]
count = 0
for sentence in self.pos_content_sentences:
for w in sentence:
if w.text.lower() not in byfreq:
count +=1
N_rw = count
self.N_rw = N_rw
SSR = 1.609*(self.N_words / self.N_s) + 331.8* (self.N_rw /self.N_words) + 22.0
self.SSR= SSR
return self.N_rw, self.SSR
def sentenceComplexity(self):
#Number of complex sentences
N_cs = 0
for sentence in self.sentences:
previous_is_verb = False
count = 0
for w in sentence:
if re.match('VERB.*', w.pos_):
if (previous_is_verb):
count += 1
previous_is_verb = False
else:
previous_is_verb = True
else:
previous_is_verb = False
if count>0:
N_cs += 1
self.N_cs = N_cs
ASL = self.N_words / self.N_s
self.ASL = ASL
CS = self.N_cs / self.N_s
self.CS = CS
SCI = (ASL + CS)/ 2
self.SCI = SCI
return self.N_cs, self.ASL, self.CS, self.SCI
def autoReadability(self):
# Number of characters
count = 0
listwords = []
for words in self.sentences:
for w in words:
if re.match('\r\n.*', w.text):
count +=1
else:
listwords.append(w.text)
self.listwords = listwords
N_charac = 0
for characters in self.listwords:
N_charac += len(characters)
self.N_charac = N_charac
ARI = 4.71 * self.N_charac / self.N_words + 0.5 * self.N_words/ self.N_s - 21.43
self.ARI = ARI
return self.N_charac, self.ARI, self.listwords
def tree_height(self,root, cont):
if not list(root.children):
return 1
else:
cont+=1
if cont == 320:
return 320
return 1 + max(self.tree_height(x, cont) for x in root.children)
def embeddingDepth(self):
## Output results
roots = [sent.root for sent in self.sentences]
max_list = []
max_list = [self.tree_height(root,0) for root in roots]
mean_max_list = sum(max_list)/(len(max_list))
max_max_list = max(max_list)
min_max_list = min(max_list)
self.max_max_list = max_max_list
self.min_max_list = min_max_list
self.mean_max_list = mean_max_list
return self.max_max_list, self.min_max_list, self.mean_max_list
def syllable_counter_spanish(self,text):
if self.lang == 'es':
t = re.sub(r'y([aáeéiíoóuú])', '\\1', text.lower())
t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())
t = re.sub(r'[iu][aáeyéioóu]', 'A', t).lower()
t = re.sub(r'[aáeéiíoóuúy]', 'A', t)
return(len(t.split('A'))-1)
elif self.lang == 'en':
return syllables.estimate(text)
def readability(self):
# Number of syllables and Number of words with 3 or more syllables:tagger
n_syllables = 0
n_syllables3 = 0
for words in self.listwords:
syllables = self.syllable_counter_spanish(words)
n_syllables += syllables
if syllables>=3:
n_syllables3 += 1
self.n_syllables = n_syllables
self.n_syllables3 = n_syllables3
# Number of letters
nletters= 0
letters = []
vecletters =[]
for word in self.listwords:
if re.match('[a-zA-Z]|á|ó|í|ú|é', word):
letters.append(word)
nletters+=len(word)
vecletters.append(len(word))
self.letters = letters
self.nletters = nletters
self.vecletters= vecletters
huertareadability = 206.835 - 60 * (self.n_syllables / self.N_words) - 102 * (self.nsentences / self.N_words)
self.huertareadability = huertareadability
ifszreadability = 206.835 - 62.3 * (self.n_syllables / self.N_words) - (self.N_words / self.nsentences)
self.ifszreadability = ifszreadability
self.syll_words = self.n_syllables / self.N_words
polinicompressibility = 95.2 - 9.7 * (self.nletters / self.N_words) - 0.35 * (self.N_words / self.nsentences)
self.polinicompressibility = polinicompressibility
self.words_sen = self.N_words / self.nsentences
x=self.nletters / self.N_words
varianza=np.var(self.vecletters)
if varianza == 0:
varianza =1
aux = self.N_words-1
if aux == 0:
aux=1
mureadability = (self.N_words /aux)*(x/varianza)*100
self.mureadability = mureadability
return self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen
def ageReadability(self):
minimumage = 0.2495 *(self.N_words/self.nsentences) + 6.4763 * (self.n_syllables /self.N_words) - 7.1395
self.minimumage = minimumage
solreadability= -2.51+0.74*(3.1291+1.0430*math.sqrt(self.n_syllables3*(30/self.nsentences)))
self.solreadability = solreadability
return self.minimumage, self.solreadability
def yearsCrawford(self):
years = -20.5 *(self.nsentences/self.N_words) + 4.9 * (self.n_syllables /self.N_words) - 3.407
self.years = years
return self.years