complexity analyzer refactored

parent 6330f677
@@ -7,7 +7,7 @@ import numpy as np
 from tqdm import tqdm
 import re
 import pandas as pd
-from TextAnalysisSpacy.TextComplexitySpacy import TextComplexitySpacy
+from texty.complexity import ComplexityAnalyzer
 import matplotlib.pyplot as plt
 #%matplotlib inline  ## when in Jupyter
@@ -18,7 +18,7 @@ from nltk.text import Text
 from lexical_diversity import lex_div as ld
 from transformers import pipeline
-class TextAnalysisSpacy():
+class Analyzer():
     def __init__(self, lang='es'):
@@ -26,15 +26,14 @@ class TextAnalysisSpacy():
         if lang == 'es':
             spacy.cli.download("es_core_news_sm")
             self.nlp = spacy.load("es_core_news_sm")
-            self.textComplexitySpacy = TextComplexitySpacy()
         elif lang == 'en':
             spacy.cli.download("en_core_web_sm")
             self.nlp = spacy.load("en_core_web_sm")
-            self.textComplexitySpacy = TextComplexitySpacy('en')
+        self.complexity_analyzer = ComplexityAnalyzer(lang, self.nlp)
         self.Text = Text
         self.FreqDist = FreqDist
         self.POS_LIST = ["ADJ", "ADP", "ADV", "AUX", "X", "CCONJ", "CONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "SPACE"]
         pass
     #
     # X = input samples, y = tags
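The refactor switches `ComplexityAnalyzer` to constructor injection: `Analyzer` passes its already-loaded spaCy pipeline instead of letting the complexity code load a second copy. A minimal usage sketch of the new wiring, assuming the `es_core_news_sm` model is installed:

```python
import spacy
from texty.complexity import ComplexityAnalyzer

# Load the pipeline once and inject it; ComplexityAnalyzer no longer
# calls spacy.load() itself, so the model is loaded a single time.
nlp = spacy.load("es_core_news_sm")
analyzer = ComplexityAnalyzer("es", nlp)
```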
......
 from functools import reduce
 import spacy
 import math
 import syllables
 import os
 import re
-crea_total_path = './CREA_total.txt'
+crea_total_path = os.path.join(os.path.dirname(__file__), 'CREA_total.txt')
-class TextComplexitySpacy():
-    def __init__(self, lang='es'):
+class ComplexityAnalyzer():
+    def __init__(self, lang, spacy_nlp):
         self.lang = lang
+        self.nlp = spacy_nlp
-        # create language analyzer
         if lang == 'es':
-            self.nlp = spacy.load("es_core_news_sm")
+            # Load CREA
+            with open(crea_total_path) as f:
+                lines = f.readlines()
+            crea = {}
+            for l in lines[1:1000]:  # words outside the 1,000 most frequent CREA entries count as low-frequency words
+                data = l.strip().split()
+                crea[data[1]] = float(data[2].replace(',', ''))
+            self.lang_word_freqs = crea
         if lang == 'en':
-            self.nlp = spacy.load("en_core_web_sm")
-            # To read the input text
-            f = open(crea_total_path)
-            lines = f.readlines()
-            f.close()
-            crea = {}
-            for l in lines[1:1000]:  # words outside the 1,000 most frequent CREA entries count as low-frequency words
-                data = l.strip().split()
-                crea[data[1]] = float(data[2].replace(',', ''))
-            self.crea = crea
-            pass
+            self.lang_word_freqs = {}
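The loader assumes a CREA-style frequency list: one header line followed by whitespace-separated rank / word / frequency columns, with commas as thousands separators in the frequency field (that is what the `replace(',', '')` implies). A hedged standalone sketch of the same parsing; `load_crea_top` is a hypothetical helper, not part of the module:

```python
def load_crea_top(path, top_n=999):
    """Parse the first top_n entries of a CREA-style frequency list.

    Assumed format (inferred from the loop above, not from the CREA
    spec itself): one header line, then whitespace-separated
    rank / word / frequency columns, with commas used as thousands
    separators in the frequency field.
    """
    freqs = {}
    with open(path) as f:
        next(f)  # skip the header line
        for i, line in enumerate(f):
            if i >= top_n:
                break
            data = line.strip().split()
            freqs[data[1]] = float(data[2].replace(',', ''))
    return freqs
```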
-    def textProcessing(self, text):
+    def read(self, text):
         # Tokenize the text and keep only the valid tokens
         doc = self.nlp(text)
         self.tokens = [w for w in doc]
@@ -43,9 +43,45 @@ class TextComplexitySpacy():
             ws = self.nlp(sentence.text)
             pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])
         self.pos_content_sentences = pos_content_sentences
+        return self.pos_content_sentences
+    def get_all_metrics(self):
+        self.punctuationMarks()
+        self.lexicalComplexity()
+        self.ssReadability()
+        self.sentenceComplexity()
+        self.autoReadability()
+        self.embeddingDepth()
+        self.readability()
+        self.ageReadability()
+        self.yearsCrawford()
+        metrics = {
+            'npunct': self.npunctuation,           # number of punctuation marks
+            'nword': self.N_words,                 # number of non-punctuation tokens (words)
+            'ILFW': self.ILFW,                     # index of low-frequency words
+            'LDI': self.LDI,                       # lexical diversity index
+            'LC': self.LC,                         # lexical complexity index
+            'nrword': self.N_rw,                   # number of rare words
+            'SSR': self.SSR,                       # Spaulding's readability score
+            'avgsentl': self.ASL,                  # average sentence length
+            'ncompsent': self.N_cs,                # number of complex sentences (those with compound verbs)
+            'nsent': self.N_s,                     # number of sentences
+            'SCI': self.SCI,                       # sentence complexity index
+            'nchar': self.N_charac,                # number of characters
+            'ARI': self.ARI,                       # automated readability index
+            'min_depth': self.min_max_list,        # minimum of the maximum parse-tree depths
+            'max_depth': self.max_max_list,        # maximum of the maximum parse-tree depths
+            'mean_depth': self.mean_max_list,      # mean of the maximum parse-tree depths
+            'nsyllab': self.n_syllables,           # number of syllables
+            'huerta': self.huertareadability,      # Fernández Huerta readability
+            'IFSZ': self.ifszreadability,          # Flesch-Szigriszt readability
+            'polini': self.polinicompressibility,  # Gutiérrez de Polini's comprehensibility
+            'mu': self.mureadability,              # mu readability
+            'minage': self.minimumage,             # minimum age
+            'SOL': self.solreadability,            # SOL readability
+            'crawford': self.years                 # Crawford's years
+        }
+        return metrics
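A hedged end-to-end sketch of the new public surface, `read()` followed by `get_all_metrics()`, assuming the Spanish model and the CREA file are in place:

```python
import spacy
from texty.complexity import ComplexityAnalyzer

nlp = spacy.load("es_core_news_sm")
ca = ComplexityAnalyzer("es", nlp)
ca.read("El gato duerme. La casa es grande y luminosa.")
metrics = ca.get_all_metrics()
print(metrics["nsent"], metrics["avgsentl"], metrics["huerta"])
```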
     def punctuationMarks(self):
         # We only want to count the tokens that are punctuation marks.
@@ -108,6 +144,9 @@ class TextComplexitySpacy():
         return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, self.LC
     def ssReadability(self):
+        '''
+        Spaulding's Score of Readability
+        '''
         # Number of rare words
         byfreq = sorted(self.lang_word_freqs, key=self.lang_word_freqs.__getitem__, reverse=True)
         byfreq = byfreq[:1500]
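For context, Spaulding's 1956 formula for Spanish is commonly cited as SSR = 1.609·(words/sentences) + 331.8·(rare words/words) + 22.0, where "rare" means outside the 1,500 most frequent words, which matches the `byfreq[:1500]` cutoff above. A sketch under those commonly cited coefficients, not a transcription of the method body the diff elides:

```python
def spaulding_ssr(n_words, n_sentences, n_rare_words):
    # Commonly cited coefficients (Spaulding, 1956); treat them as an
    # assumption until checked against the full implementation.
    return (1.609 * (n_words / n_sentences)
            + 331.8 * (n_rare_words / n_words) + 22.0)
```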
@@ -202,7 +241,7 @@ class TextComplexitySpacy():
         return self.max_max_list, self.min_max_list, self.mean_max_list
-    def syllable_counter_spanish(self,text):
+    def syllable_counter_spanish(self, text):
         if self.lang == 'es':
             t = re.sub(r'y([aáeéiíoóuú])', '\\1', text.lower())
             t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())
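The two substitutions drop a consonantal 'y' before a vowel and collapse falling diphthongs into a single nucleus marker, so syllables can then be counted as vowel groups. A rough standalone sketch of that idea; the rising-diphthong step is an assumption, since the diff elides the rest of the method:

```python
import re

def count_spanish_syllables_rough(word):
    t = word.lower()
    t = re.sub(r'y([aáeéiíoóuú])', r'\1', t)  # consonantal 'y' before a vowel
    t = re.sub(r'[aáeéioóu][iuy]', 'A', t)    # falling diphthongs -> one nucleus
    t = re.sub(r'[iu][aáeéioóuú]', 'A', t)    # rising diphthongs (assumed step)
    return len(re.findall(r'[aáeéiíoóuúA]', t))

# count_spanish_syllables_rough("murciélago") -> 4 (mur-cié-la-go)
```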
@@ -265,7 +304,7 @@ class TextComplexitySpacy():
         self.mureadability = mureadability
         return self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen
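These attributes feed the classic Spanish readability indices. For reference, Fernández Huerta (1959) is usually cited as 206.84 - 0.60·P - 1.02·F, with P = syllables per 100 words and F = sentences per 100 words, and Flesch-Szigriszt (IFSZ) as 206.835 - 62.3·(syllables/words) - (words/sentences). A sketch under those commonly cited forms, not a transcription of the elided method:

```python
def fernandez_huerta(n_syllables, n_words, n_sentences):
    # 206.84 - 0.60*P - 1.02*F, with P and F taken per 100 words
    p = 100.0 * n_syllables / n_words
    f = 100.0 * n_sentences / n_words
    return 206.84 - 0.60 * p - 1.02 * f

def flesch_szigriszt(n_syllables, n_words, n_sentences):
    # IFSZ = 206.835 - 62.3*(syllables/words) - (words/sentences)
    return 206.835 - 62.3 * (n_syllables / n_words) - (n_words / n_sentences)
```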
     def ageReadability(self):
         minimumage = 0.2495 * (self.N_words / self.nsentences) + 6.4763 * (self.n_syllables / self.N_words) - 7.1395
......