complexity analyzer refactored

parent 6330f677
The file could not be displayed because it is too large.
...@@ -7,7 +7,7 @@ import numpy as np ...@@ -7,7 +7,7 @@ import numpy as np
from tqdm import tqdm from tqdm import tqdm
import re import re
import pandas as pd import pandas as pd
from TextAnalysisSpacy.TextComplexitySpacy import TextComplexitySpacy from texty.complexity import ComplexityAnalyzer
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
#%matplotlib inline ## when in Jupyter #%matplotlib inline ## when in Jupyter
...@@ -18,7 +18,7 @@ from nltk.text import Text ...@@ -18,7 +18,7 @@ from nltk.text import Text
from lexical_diversity import lex_div as ld from lexical_diversity import lex_div as ld
from transformers import pipeline from transformers import pipeline
class TextAnalysisSpacy(): class Analyzer():
def __init__(self, lang='es'): def __init__(self, lang='es'):
...@@ -26,15 +26,14 @@ class TextAnalysisSpacy(): ...@@ -26,15 +26,14 @@ class TextAnalysisSpacy():
if lang == 'es': if lang == 'es':
spacy.cli.download("es_core_news_sm") spacy.cli.download("es_core_news_sm")
self.nlp = spacy.load("es_core_news_sm") self.nlp = spacy.load("es_core_news_sm")
self.textComplexitySpacy = TextComplexitySpacy()
elif lang == 'en': elif lang == 'en':
spacy.cli.download("en_core_web_sm")
self.nlp = spacy.load("en_core_web_sm") self.nlp = spacy.load("en_core_web_sm")
self.textComplexitySpacy = TextComplexitySpacy('en')
self.complexity_analyzer = ComplexityAnalyzer(self.nlp)
self.Text = Text self.Text = Text
self.FreqDist = FreqDist self.FreqDist = FreqDist
self.POS_LIST = ["ADJ", "ADP", "ADV", "AUX","X", "CCONJ","CONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "SPACE"] self.POS_LIST = ["ADJ", "ADP", "ADV", "AUX","X", "CCONJ","CONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "SPACE"]
pass
# #
# X = samples input , y = tags # X = samples input , y = tags
......
from functools import reduce from functools import reduce
from tkinter.font import _MetricsDict
import spacy import spacy
import math import math
import syllables import syllables
import os
import re
crea_total_path = './CREA_total.txt' crea_total_path = os.path.join(os.path.dirname(__file__), 'CREA_total.txt')
class ComplexityAnalyzer():
class TextComplexitySpacy():
def __init__(self, lang='es'): def __init__(self, lang, spacy_nlp):
self.lang = lang self.lang = lang
self.nlp = spacy_nlp
# create language analyzer # create language analyzer
if lang == 'es': if lang == 'es':
self.nlp = spacy.load("es_core_news_sm") # Load CREA
with open(crea_total_path) as f:
lines = f.readlines()
f.close()
crea = {}
for l in lines[1:1000]: # those words not in the 1000 most frequent words in CREA are low frequency words
data = l.strip().split()
crea[data[1]] = float(data[2].replace(',', ''))
self.lang_word_freqs = crea
if lang == 'en': if lang == 'en':
self.nlp = spacy.load("en_core_web_sm") self.lang_word_freqs = {}
# Para leer el texto que introducimos
f = open(crea_total_path)
lines = f.readlines()
f.close()
crea = {}
for l in lines[1:1000]: # those words not in the 1000 most frequent words in CREA are low frequency words
data = l.strip().split()
crea[data[1]] = float(data[2].replace(',', ''))
self.crea = crea
pass
def textProcessing(self, text): def read(self, text):
# Meter todas las funciones en una patron de los tokens válidos # Meter todas las funciones en una patron de los tokens válidos
doc = self.nlp(text) doc = self.nlp(text)
self.tokens = [w for w in doc] self.tokens = [w for w in doc]
...@@ -43,9 +43,45 @@ class TextComplexitySpacy(): ...@@ -43,9 +43,45 @@ class TextComplexitySpacy():
ws = self.nlp(sentence.text) ws = self.nlp(sentence.text)
pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)]) pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])
self.pos_content_sentences = pos_content_sentences self.pos_content_sentences = pos_content_sentences
return self.pos_content_sentences
def get_all_metrics(self):
'''
Run every metric routine of this analyzer and gather the results in one dict.

The nine calls below have their return values discarded; the values placed
in the dict are read back from attributes of self afterwards.
NOTE(review): presumably each routine stores its results on self, and some
depend on values left by earlier ones (ageReadability reads
self.n_syllables), so keep the call order -- confirm against the full
method bodies.
NOTE(review): presumably read() must have processed a text before this is
called -- confirm.

Returns a dict mapping short metric names to the stored values.
'''
self.punctuationMarks()
self.lexicalComplexity()
self.ssReadability()
self.sentenceComplexity()
self.autoReadability()
self.embeddingDepth()
self.readability()
self.ageReadability()
self.yearsCrawford()
metrics = {
'npunct': self.npunctuation, # number of punctuation marks
'nword': self.N_words, # number of non punctuation tokens (words)
'ILFW': self.ILFW, # index of low frequency words
'LDI': self.LDI, # lexical diversity index
'LC': self.LC, # lexical complexity index
'nrword': self.N_rw, # number of rare words
'SSR': self.SSR, # Spaulding's readability score
'avgsentl': self.ASL, # average sentences length
'ncompsent': self.N_cs, # number of complex sentences (those with composed verbs)
'nsent': self.N_s, # number of sentences
'SCI': self.SCI, # sentence complexity index
'nchar': self.N_charac, # number of characters
'ARI': self.ARI, # auto readability index
'min_depth': self.min_max_list, # minimum of maximum tree depths
'max_depth': self.max_max_list, # maximum of maximum tree depths
'mean_depth': self.mean_max_list, # mean of maximum tree depths
'nsyllab': self.n_syllables, # number of syllables
'huerta': self.huertareadability, # Huerta's readability
'IFSZ': self.ifszreadability, # Flesch-Szigrist legibility
'polini': self.polinicompressibility, # Polini's compressibility
'mu': self.mureadability, # Mu readability
'minage': self.minimumage, # minimum age
'SOL': self.solreadability, # SOL readability
'crawford': self.years # Crawford's years
}
return metrics
def punctuationMarks(self): def punctuationMarks(self):
# Solo nos interesa contar los tokens que sean signo de puntuación. # Solo nos interesa contar los tokens que sean signo de puntuación.
...@@ -108,6 +144,9 @@ class TextComplexitySpacy(): ...@@ -108,6 +144,9 @@ class TextComplexitySpacy():
return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, self.LC return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, self.LC
def ssReadability(self): def ssReadability(self):
'''
Spaulding Score of Readability
'''
#Number of rare words #Number of rare words
byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True) byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True)
byfreq = byfreq[:1500] byfreq = byfreq[:1500]
...@@ -202,7 +241,7 @@ class TextComplexitySpacy(): ...@@ -202,7 +241,7 @@ class TextComplexitySpacy():
return self.max_max_list, self.min_max_list, self.mean_max_list return self.max_max_list, self.min_max_list, self.mean_max_list
def syllable_counter_spanish(self,text): def syllable_counter_spanish(self, text):
if self.lang == 'es': if self.lang == 'es':
t = re.sub(r'y([aáeéiíoóuú])', '\\1', text.lower()) t = re.sub(r'y([aáeéiíoóuú])', '\\1', text.lower())
t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower()) t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())
...@@ -265,7 +304,7 @@ class TextComplexitySpacy(): ...@@ -265,7 +304,7 @@ class TextComplexitySpacy():
self.mureadability = mureadability self.mureadability = mureadability
return self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen return self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen
def ageReadability(self): def ageReadability(self):
minimumage = 0.2495 *(self.N_words/self.nsentences) + 6.4763 * (self.n_syllables /self.N_words) - 7.1395 minimumage = 0.2495 *(self.N_words/self.nsentences) + 6.4763 * (self.n_syllables /self.N_words) - 7.1395
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment