Add TextComplexitySpacy

import re
import math
from functools import reduce

import numpy as np
import spacy
import spacy.cli

# Download the small Spanish model only if it is not already installed
try:
    import es_core_news_sm
except ImportError:
    spacy.cli.download("es_core_news_sm")
    import es_core_news_sm
class TextComplexitySpacy:

    def __init__(self, lang='es'):
        self.lang = lang
        # Create the language analyzer
        self.nlp = es_core_news_sm.load()
        # Load the CREA frequency list; words outside its 1000 most
        # frequent entries are treated as low-frequency words
        with open('./CREA_total.txt') as f:
            lines = f.readlines()
        crea = {}
        for l in lines[1:1001]:  # skip the header row, keep the top 1000 words
            data = l.strip().split()
            crea[data[1]] = float(data[2].replace(',', ''))
        self.crea = crea
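
    # Illustrative note on the assumed input format: the loader above
    # expects CREA_total.txt to be a whitespace-separated table with one
    # header row, along the lines of (sample row invented):
    #
    #     Orden    Palabra    Frec.absoluta    Frec.normalizada
    #     1.       de         9,999,518        65545.55
    #
    # so data[1] is the word and data[2] its absolute frequency, with
    # commas as thousand separators.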
    def textProcessing(self, text):
        # Run the pipeline over the text and collect its tokens and sentences
        doc = self.nlp(text)
        self.tokens = [w for w in doc]
        self.sentences = [sent for sent in doc.sents]
        self.nsentences = len(self.sentences)
        print('Sentences: ', self.sentences)
        # Keep only content words: nouns, verbs and adjectives
        pos_content_sentences = []
        for sentence in self.sentences:
            ws = self.nlp(sentence.text)
            pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])
        self.pos_content_sentences = pos_content_sentences
        return self.pos_content_sentences
    def punctuationMarks(self):
        # Separate punctuation tokens from word tokens
        punctuation = []
        N_words = []
        for w in self.tokens:
            if re.match('PUNCT.*', w.pos_):
                punctuation.append(w.text)
            else:
                N_words.append(w.text)
        print('The words in the text are: ', N_words)
        # Guard against empty texts so later ratios never divide by zero
        aux = len(N_words)
        if aux == 0:
            aux = 1
        self.N_words = aux
        print('Number of words (N_words): ', self.N_words, '\n')
        self.npunctuation = len(punctuation)
        self.punctuation = punctuation
        print('PUNCTUATION MARKS = ', self.npunctuation, '\n')
        return self.npunctuation, self.punctuation, self.N_words
    def lexicalComplexity(self):
        # Number of low frequency words
        count = 0
        for sentence in self.pos_content_sentences:
            for w in sentence:
                if w.text not in self.crea:
                    count += 1
        N_lfw = count
        self.N_lfw = N_lfw
        print('Number of low frequency words (N_lfw): ', self.N_lfw, '\n')
        # Number of distinct content words
        N_dcw = len(set([w.text.lower() for s in self.pos_content_sentences for w in s]))
        self.N_dcw = N_dcw
        print('Number of distinct content words (N_dcw) = ', self.N_dcw, '\n')
        # Number of sentences
        self.N_s = len(self.pos_content_sentences)
        print('Number of sentences (N_s): ', self.N_s, '\n')
        # Number of total content words (initializer 0 keeps reduce safe on empty input)
        N_cw = reduce((lambda x, y: x + y), [len(s) for s in self.pos_content_sentences], 0)
        self.N_cw = N_cw
        print('Number of total content words (N_cw): ', self.N_cw, '\n')
        # Lexical Distribution Index: distinct content words per sentence
        if self.N_s == 0:
            self.N_s = 1
        LDI = N_dcw / float(self.N_s)
        self.LDI = LDI
        print('Lexical Distribution Index (LDI) = ', self.LDI, '\n')
        # Index of Low Frequency Words: share of content words that are low frequency
        if N_cw == 0:
            N_cw = 1
        ILFW = N_lfw / float(N_cw)
        self.ILFW = ILFW
        print('Index of Low Frequency Words (ILFW) = ', self.ILFW, '\n')
        # Lexical Complexity: mean of the two indices
        LC = (LDI + ILFW) / 2
        self.LC = LC
        print('LEXICAL COMPLEXITY INDEX (LC) =', LC, '\n')
        return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, self.LC
    def ssReadability(self):
        # Rare words: content words outside the most frequent CREA entries
        # (capped at 1500, i.e. at most the number of entries loaded in __init__)
        byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True)
        byfreq = byfreq[:1500]
        count = 0
        for sentence in self.pos_content_sentences:
            for w in sentence:
                if w.text.lower() not in byfreq:
                    count += 1
        N_rw = count
        self.N_rw = N_rw
        print('Number of rare words (N_rw): ', self.N_rw, '\n')
        # Spaulding's Spanish readability formula
        SSR = 1.609 * (self.N_words / self.N_s) + 331.8 * (self.N_rw / self.N_words) + 22.0
        self.SSR = SSR
        print('SPAULDING SPANISH READABILITY (SSR) ', self.SSR, '\n')
        return self.N_rw, self.SSR
    def sentenceComplexity(self):
        # A sentence is counted as complex when it contains at least one
        # pair of consecutive verb tokens
        N_cs = 0
        for sentence in self.sentences:
            previous_is_verb = False
            count = 0
            for w in sentence:
                if re.match('VERB.*', w.pos_):
                    if previous_is_verb:
                        count += 1
                        previous_is_verb = False
                    else:
                        previous_is_verb = True
                else:
                    previous_is_verb = False
            if count > 0:
                N_cs += 1
        self.N_cs = N_cs
        print('Number of complex sentences: ', self.N_cs, '\n')
        ASL = self.N_words / self.N_s
        self.ASL = ASL
        print('Average Sentence Length (ASL) = ', self.ASL, '\n')
        CS = self.N_cs / self.N_s
        self.CS = CS
        print('Complex Sentences (CS) = ', self.CS, '\n')
        SCI = (ASL + CS) / 2
        self.SCI = SCI
        print('SENTENCE COMPLEXITY INDEX (SCI) = ', self.SCI, '\n')
        return self.N_cs, self.ASL, self.CS, self.SCI
    def autoReadability(self):
        # Count characters, skipping line-break tokens
        count = 0
        listwords = []
        for words in self.sentences:
            for w in words:
                if re.match(r'\r\n.*', w.text):
                    count += 1
                else:
                    listwords.append(w.text)
        self.listwords = listwords
        N_charac = 0
        for characters in self.listwords:
            N_charac += len(characters)
        self.N_charac = N_charac
        print('Number of characters: ', self.N_charac, '\n')
        # Automated Readability Index
        ARI = 4.71 * self.N_charac / self.N_words + 0.5 * self.N_words / self.N_s - 21.43
        self.ARI = ARI
        print('AUTOMATED READABILITY INDEX (ARI) = ', self.ARI, '\n')
        return self.N_charac, self.ARI, self.listwords
    def tree_height(self, root):
        # Recursively compute the depth of a dependency subtree
        if not list(root.children):
            return 1
        else:
            return 1 + max(self.tree_height(x) for x in root.children)
    def embeddingDepth(self):
        # Depth of the dependency tree of each sentence
        roots = [sent.root for sent in self.sentences]
        max_list = [self.tree_height(root) for root in roots]
        mean_max_list = sum(max_list) / len(max_list)
        max_max_list = max(max_list)
        min_max_list = min(max_list)
        print('MAXIMUM EMBEDDING DEPTH OF SENTENCE (MaxDEPTH): ', max_max_list, '\n')
        print('MINIMUM EMBEDDING DEPTH OF SENTENCE (MinDEPTH): ', min_max_list, '\n')
        print('AVERAGE EMBEDDING DEPTH OF SENTENCE (MeanDEPTH): ', mean_max_list, '\n')
        self.max_max_list = max_max_list
        self.min_max_list = min_max_list
        self.mean_max_list = mean_max_list
        return self.max_max_list, self.min_max_list, self.mean_max_list
    def syllable_counter_spanish(self, text):
        # Heuristic syllable counter: drop consonantal 'y' before a vowel,
        # collapse diphthongs into a single nucleus, then replace every
        # remaining vowel with the marker 'A' and count the markers
        t = re.sub(r'y([aáeéiíoóuú])', r'\1', text.lower())
        t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())
        t = re.sub(r'[iu][aáeyéioóu]', 'A', t).lower()
        t = re.sub(r'[aáeéiíoóuúy]', 'A', t)
        return len(t.split('A')) - 1
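
    # Illustrative sanity check of the rules above:
    # syllable_counter_spanish('lectura') -> 3, and
    # syllable_counter_spanish('ciudad') -> 2, since the diphthong 'iu'
    # is collapsed into a single nucleus before the vowels are counted.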
    def readability(self):
        # Number of syllables, and number of words with 3 or more syllables
        n_syllables = 0
        n_syllables3 = 0
        for words in self.listwords:
            syllables = self.syllable_counter_spanish(words)
            n_syllables += syllables
            if syllables >= 3:
                n_syllables3 += 1
        self.n_syllables = n_syllables
        self.n_syllables3 = n_syllables3
        # Number of letters, keeping per-word lengths for the Mu formula
        nletters = 0
        letters = []
        vecletters = []
        for word in self.listwords:
            if re.match('[a-zA-Z]|á|ó|í|ú|é', word):
                letters.append(word)
                nletters += len(word)
                vecletters.append(len(word))
        self.letters = letters
        self.nletters = nletters
        self.vecletters = vecletters
        # Fernández Huerta readability
        huertareadability = 206.835 - 60 * (self.n_syllables / self.N_words) - 102 * (self.nsentences / self.N_words)
        print('THE READABILITY OF HUERTA: ', huertareadability, '\n')
        self.huertareadability = huertareadability
        # Flesch-Szigriszt index (IFSZ)
        ifszreadability = 206.835 - 62.3 * (self.n_syllables / self.N_words) - (self.N_words / self.nsentences)
        print('THE READABILITY IFSZ: ', ifszreadability, '\n')
        self.ifszreadability = ifszreadability
        self.syll_words = self.n_syllables / self.N_words
        # Gutiérrez Polini's comprehensibility
        polinicompressibility = 95.2 - 9.7 * (self.nletters / self.N_words) - 0.35 * (self.N_words / self.nsentences)
        print('THE COMPRESSIBILITY OF GUTIÉRREZ POLINI: ', polinicompressibility, '\n')
        self.polinicompressibility = polinicompressibility
        self.words_sen = self.N_words / self.nsentences
        # Mu readability: average letters per word over the variance of
        # letters per word, with guards against division by zero
        x = self.nletters / self.N_words
        varianza = np.var(self.vecletters)
        if varianza == 0:
            varianza = 1
        aux = self.N_words - 1
        if aux == 0:
            aux = 1
        mureadability = (self.N_words / aux) * (x / varianza) * 100
        print('READABILITY MU: ', mureadability, '\n')
        self.mureadability = mureadability
        return self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen
    def ageReadability(self):
        # Minimum age needed to understand the text
        minimumage = 0.2495 * (self.N_words / self.nsentences) + 6.4763 * (self.n_syllables / self.N_words) - 7.1395
        print('MINIMUM AGE TO UNDERSTAND A TEXT: ', minimumage, '\n')
        self.minimumage = minimumage
        # SOL readability, derived from the SMOG grade
        solreadability = -2.51 + 0.74 * (3.1291 + 1.0430 * math.sqrt(self.n_syllables3 * (30 / self.nsentences)))
        print('THE READABILITY SOL: ', solreadability, '\n')
        self.solreadability = solreadability
        return self.minimumage, self.solreadability
    def yearsCrawford(self):
        # Crawford's formula: years of schooling needed to read the text
        years = -20.5 * (self.nsentences / self.N_words) + 4.9 * (self.n_syllables / self.N_words) - 3.407
        print('YEARS NEEDED: ', years, '\n')
        self.years = years
        return self.years
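

# Illustrative usage sketch: the sample text is invented, and
# CREA_total.txt must sit next to this script for the constructor to work.
# Methods are called in dependency order (e.g. readability() needs the
# word list built by autoReadability()).
if __name__ == '__main__':
    tc = TextComplexitySpacy()
    sample = ('El perro corre por el parque. '
              'Los niños juegan mientras sus padres conversan.')
    tc.textProcessing(sample)
    tc.punctuationMarks()
    tc.lexicalComplexity()
    tc.ssReadability()
    tc.sentenceComplexity()
    tc.autoReadability()
    tc.embeddingDepth()
    tc.readability()
    tc.ageReadability()
    tc.yearsCrawford()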