Add TextComplexitySpacy

import re
import math
from functools import reduce

import numpy as np
import spacy
import spacy.cli

# Download the small Spanish model only if it is not already installed
try:
    import es_core_news_sm
except ImportError:
    spacy.cli.download("es_core_news_sm")
    import es_core_news_sm
class TextComplexitySpacy:

    def __init__(self, lang='es'):
        self.lang = lang
        # Create the language analyzer
        self.nlp = es_core_news_sm.load()
        # Load the CREA frequency list; words outside its 1000 most
        # frequent entries are treated as low-frequency words
        with open('./CREA_total.txt') as f:
            lines = f.readlines()
        crea = {}
        for l in lines[1:1001]:  # skip the header row, keep the top 1000 words
            data = l.strip().split()
            crea[data[1]] = float(data[2].replace(',', ''))
        self.crea = crea
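
    # Illustrative note on the assumed input format: the loader above
    # expects CREA_total.txt to be a whitespace-separated table with one
    # header row, along the lines of (sample row invented):
    #
    #     Orden    Palabra    Frec.absoluta    Frec.normalizada
    #     1.       de         9,999,518        65545.55
    #
    # so data[1] is the word and data[2] its absolute frequency, with
    # commas as thousand separators.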
    def textProcessing(self, text):
        # Run the pipeline over the text and collect its tokens and sentences
        doc = self.nlp(text)
        self.tokens = [w for w in doc]
        self.sentences = [sent for sent in doc.sents]
        self.nsentences = len(self.sentences)
        print('Sentences: ', self.sentences)
        # Keep only content words: nouns, verbs and adjectives
        pos_content_sentences = []
        for sentence in self.sentences:
            ws = self.nlp(sentence.text)
            pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])
        self.pos_content_sentences = pos_content_sentences
        return self.pos_content_sentences
    def punctuationMarks(self):
        # Separate punctuation tokens from word tokens
        punctuation = []
        N_words = []
        for w in self.tokens:
            if re.match('PUNCT.*', w.pos_):
                punctuation.append(w.text)
            else:
                N_words.append(w.text)
        print('The words in the text are: ', N_words)
        # Guard against empty texts so later ratios never divide by zero
        aux = len(N_words)
        if aux == 0:
            aux = 1
        self.N_words = aux
        print('Number of words (N_words): ', self.N_words, '\n')
        self.npunctuation = len(punctuation)
        self.punctuation = punctuation
        print('PUNCTUATION MARKS = ', self.npunctuation, '\n')
        return self.npunctuation, self.punctuation, self.N_words
    def lexicalComplexity(self):
        # Number of low frequency words
        count = 0
        for sentence in self.pos_content_sentences:
            for w in sentence:
                if w.text not in self.crea:
                    count += 1
        N_lfw = count
        self.N_lfw = N_lfw
        print('Number of low frequency words (N_lfw): ', self.N_lfw, '\n')
        # Number of distinct content words
        N_dcw = len(set([w.text.lower() for s in self.pos_content_sentences for w in s]))
        self.N_dcw = N_dcw
        print('Number of distinct content words (N_dcw) = ', self.N_dcw, '\n')
        # Number of sentences
        self.N_s = len(self.pos_content_sentences)
        print('Number of sentences (N_s): ', self.N_s, '\n')
        # Number of total content words (initializer 0 keeps reduce safe on empty input)
        N_cw = reduce((lambda x, y: x + y), [len(s) for s in self.pos_content_sentences], 0)
        self.N_cw = N_cw
        print('Number of total content words (N_cw): ', self.N_cw, '\n')
        # Lexical Distribution Index: distinct content words per sentence
        if self.N_s == 0:
            self.N_s = 1
        LDI = N_dcw / float(self.N_s)
        self.LDI = LDI
        print('Lexical Distribution Index (LDI) = ', self.LDI, '\n')
        # Index of Low Frequency Words: share of content words that are low frequency
        if N_cw == 0:
            N_cw = 1
        ILFW = N_lfw / float(N_cw)
        self.ILFW = ILFW
        print('Index of Low Frequency Words (ILFW) = ', self.ILFW, '\n')
        # Lexical Complexity: mean of the two indices
        LC = (LDI + ILFW) / 2
        self.LC = LC
        print('LEXICAL COMPLEXITY INDEX (LC) =', LC, '\n')
        return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, self.LC
    def ssReadability(self):
        # Rare words: content words outside the most frequent CREA entries
        # (capped at 1500, i.e. at most the number of entries loaded in __init__)
        byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True)
        byfreq = byfreq[:1500]
        count = 0
        for sentence in self.pos_content_sentences:
            for w in sentence:
                if w.text.lower() not in byfreq:
                    count += 1
        N_rw = count
        self.N_rw = N_rw
        print('Number of rare words (N_rw): ', self.N_rw, '\n')
        # Spaulding's Spanish readability formula
        SSR = 1.609 * (self.N_words / self.N_s) + 331.8 * (self.N_rw / self.N_words) + 22.0
        self.SSR = SSR
        print('SPAULDING SPANISH READABILITY (SSR) ', self.SSR, '\n')
        return self.N_rw, self.SSR
    def sentenceComplexity(self):
        # A sentence is counted as complex when it contains at least one
        # pair of consecutive verb tokens
        N_cs = 0
        for sentence in self.sentences:
            previous_is_verb = False
            count = 0
            for w in sentence:
                if re.match('VERB.*', w.pos_):
                    if previous_is_verb:
                        count += 1
                        previous_is_verb = False
                    else:
                        previous_is_verb = True
                else:
                    previous_is_verb = False
            if count > 0:
                N_cs += 1
        self.N_cs = N_cs
        print('Number of complex sentences: ', self.N_cs, '\n')
        ASL = self.N_words / self.N_s
        self.ASL = ASL
        print('Average Sentence Length (ASL) = ', self.ASL, '\n')
        CS = self.N_cs / self.N_s
        self.CS = CS
        print('Complex Sentences (CS) = ', self.CS, '\n')
        SCI = (ASL + CS) / 2
        self.SCI = SCI
        print('SENTENCE COMPLEXITY INDEX (SCI) = ', self.SCI, '\n')
        return self.N_cs, self.ASL, self.CS, self.SCI
    def autoReadability(self):
        # Count characters, skipping line-break tokens
        count = 0
        listwords = []
        for words in self.sentences:
            for w in words:
                if re.match(r'\r\n.*', w.text):
                    count += 1
                else:
                    listwords.append(w.text)
        self.listwords = listwords
        N_charac = 0
        for characters in self.listwords:
            N_charac += len(characters)
        self.N_charac = N_charac
        print('Number of characters: ', self.N_charac, '\n')
        # Automated Readability Index
        ARI = 4.71 * self.N_charac / self.N_words + 0.5 * self.N_words / self.N_s - 21.43
        self.ARI = ARI
        print('AUTOMATED READABILITY INDEX (ARI) = ', self.ARI, '\n')
        return self.N_charac, self.ARI, self.listwords
    def tree_height(self, root):
        # Recursively compute the depth of a dependency subtree
        if not list(root.children):
            return 1
        else:
            return 1 + max(self.tree_height(x) for x in root.children)
    def embeddingDepth(self):
        # Depth of the dependency tree of each sentence
        roots = [sent.root for sent in self.sentences]
        max_list = [self.tree_height(root) for root in roots]
        mean_max_list = sum(max_list) / len(max_list)
        max_max_list = max(max_list)
        min_max_list = min(max_list)
        print('MAXIMUM EMBEDDING DEPTH OF SENTENCE (MaxDEPTH): ', max_max_list, '\n')
        print('MINIMUM EMBEDDING DEPTH OF SENTENCE (MinDEPTH): ', min_max_list, '\n')
        print('AVERAGE EMBEDDING DEPTH OF SENTENCE (MeanDEPTH): ', mean_max_list, '\n')
        self.max_max_list = max_max_list
        self.min_max_list = min_max_list
        self.mean_max_list = mean_max_list
        return self.max_max_list, self.min_max_list, self.mean_max_list
    def syllable_counter_spanish(self, text):
        # Heuristic syllable counter: drop consonantal 'y' before a vowel,
        # collapse diphthongs into a single nucleus, then replace every
        # remaining vowel with the marker 'A' and count the markers
        t = re.sub(r'y([aáeéiíoóuú])', r'\1', text.lower())
        t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())
        t = re.sub(r'[iu][aáeyéioóu]', 'A', t).lower()
        t = re.sub(r'[aáeéiíoóuúy]', 'A', t)
        return len(t.split('A')) - 1
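
    # Illustrative sanity check of the rules above:
    # syllable_counter_spanish('lectura') -> 3, and
    # syllable_counter_spanish('ciudad') -> 2, since the diphthong 'iu'
    # is collapsed into a single nucleus before the vowels are counted.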
    def readability(self):
        # Number of syllables, and number of words with 3 or more syllables
        n_syllables = 0
        n_syllables3 = 0
        for words in self.listwords:
            syllables = self.syllable_counter_spanish(words)
            n_syllables += syllables
            if syllables >= 3:
                n_syllables3 += 1
        self.n_syllables = n_syllables
        self.n_syllables3 = n_syllables3
        # Number of letters, keeping per-word lengths for the Mu formula
        nletters = 0
        letters = []
        vecletters = []
        for word in self.listwords:
            if re.match('[a-zA-Z]|á|ó|í|ú|é', word):
                letters.append(word)
                nletters += len(word)
                vecletters.append(len(word))
        self.letters = letters
        self.nletters = nletters
        self.vecletters = vecletters
        # Fernández Huerta readability
        huertareadability = 206.835 - 60 * (self.n_syllables / self.N_words) - 102 * (self.nsentences / self.N_words)
        print('THE READABILITY OF HUERTA: ', huertareadability, '\n')
        self.huertareadability = huertareadability
        # Flesch-Szigriszt index (IFSZ)
        ifszreadability = 206.835 - 62.3 * (self.n_syllables / self.N_words) - (self.N_words / self.nsentences)
        print('THE READABILITY IFSZ: ', ifszreadability, '\n')
        self.ifszreadability = ifszreadability
        self.syll_words = self.n_syllables / self.N_words
        # Gutiérrez Polini's comprehensibility
        polinicompressibility = 95.2 - 9.7 * (self.nletters / self.N_words) - 0.35 * (self.N_words / self.nsentences)
        print('THE COMPRESSIBILITY OF GUTIÉRREZ POLINI: ', polinicompressibility, '\n')
        self.polinicompressibility = polinicompressibility
        self.words_sen = self.N_words / self.nsentences
        # Mu readability: average letters per word over the variance of
        # letters per word, with guards against division by zero
        x = self.nletters / self.N_words
        varianza = np.var(self.vecletters)
        if varianza == 0:
            varianza = 1
        aux = self.N_words - 1
        if aux == 0:
            aux = 1
        mureadability = (self.N_words / aux) * (x / varianza) * 100
        print('READABILITY MU: ', mureadability, '\n')
        self.mureadability = mureadability
        return self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen
    def ageReadability(self):
        # Minimum age needed to understand the text
        minimumage = 0.2495 * (self.N_words / self.nsentences) + 6.4763 * (self.n_syllables / self.N_words) - 7.1395
        print('MINIMUM AGE TO UNDERSTAND A TEXT: ', minimumage, '\n')
        self.minimumage = minimumage
        # SOL readability, derived from the SMOG grade
        solreadability = -2.51 + 0.74 * (3.1291 + 1.0430 * math.sqrt(self.n_syllables3 * (30 / self.nsentences)))
        print('THE READABILITY SOL: ', solreadability, '\n')
        self.solreadability = solreadability
        return self.minimumage, self.solreadability
    def yearsCrawford(self):
        # Crawford's formula: years of schooling needed to read the text
        years = -20.5 * (self.nsentences / self.N_words) + 4.9 * (self.n_syllables / self.N_words) - 3.407
        print('YEARS NEEDED: ', years, '\n')
        self.years = years
        return self.years
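

# Illustrative usage sketch: the sample text is invented, and
# CREA_total.txt must sit next to this script for the constructor to work.
# Methods are called in dependency order (e.g. readability() needs the
# word list built by autoReadability()).
if __name__ == '__main__':
    tc = TextComplexitySpacy()
    sample = ('El perro corre por el parque. '
              'Los niños juegan mientras sus padres conversan.')
    tc.textProcessing(sample)
    tc.punctuationMarks()
    tc.lexicalComplexity()
    tc.ssReadability()
    tc.sentenceComplexity()
    tc.autoReadability()
    tc.embeddingDepth()
    tc.readability()
    tc.ageReadability()
    tc.yearsCrawford()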