complexity analyzer refactored

e27fddf2 · Arturo Montejo Ráez · 6330f677 · e27fddf2 · e27fddf2 · e27fddf2
Commit e27fddf2 authored Mar 05, 2022 by Arturo Montejo Ráez
Showing with 66 additions and 28 deletions
example.ipynb → examples/example.ipynb
texts.csv → examples/texts.csv
CREA_total.txt → src/texty/CREA_total.txt
__init__.py → src/texty/__init__.py
TextAnalysisSpacy.py → src/texty/analyzer.py
TextComplexitySpacy.py → src/texty/complexity.py
--- a/example.ipynb
+++ b/example.ipynb
--- a/texts.csv
+++ b/texts.csv
--- a/CREA_total.txt
+++ b/CREA_total.txt
--- a/__init__.py
+++ b/__init__.py
--- a/TextAnalysisSpacy.py
+++ b/TextAnalysisSpacy.py
@@ -7,7 +7,7 @@ import numpy as np
 from tqdm import tqdm
 import re
 import pandas as pd
-from TextAnalysisSpacy.TextComplexitySpacy import TextComplexitySpacy
+from texty.complexity import ComplexityAnalyzer

 import matplotlib.pyplot as plt
 #%matplotlib inline ## when in Jupyter
@@ -18,7 +18,7 @@ from nltk.text import Text
 from lexical_diversity import lex_div as ld
 from transformers import pipeline

-class TextAnalysisSpacy():
+class Analyzer():
            
    def __init__(self, lang='es'):

@@ -26,15 +26,14 @@ class TextAnalysisSpacy():
      if lang == 'es':
        spacy.cli.download("es_core_news_sm")
        self.nlp = spacy.load("es_core_news_sm")
-        self.textComplexitySpacy = TextComplexitySpacy()
      elif lang == 'en':
+        spacy.cli.download("en_core_web_sm")
        self.nlp = spacy.load("en_core_web_sm")
-        self.textComplexitySpacy = TextComplexitySpacy('en')
-
+      
+      self.complexity_analyzer = ComplexityAnalyzer(self.nlp)
      self.Text = Text
      self.FreqDist = FreqDist
      self.POS_LIST = ["ADJ", "ADP", "ADV", "AUX","X", "CCONJ","CONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "SPACE"]
-    pass

    # 
    # X = samples input , y = tags

--- a/TextComplexitySpacy.py
+++ b/TextComplexitySpacy.py
 from functools import reduce
+from tkinter.font import _MetricsDict
 import spacy
 import math
 import syllables
+import os
+import re

-crea_total_path = './CREA_total.txt'
+crea_total_path = os.path.join(os.path.dirname(__file__), 'CREA_total.txt')  

-
-class TextComplexitySpacy():
+class ComplexityAnalyzer():
            
-    def __init__(self, lang='es'):
+    def __init__(self, lang, spacy_nlp):
        self.lang = lang
+        self.nlp = spacy_nlp

        # create language analyzer
        if lang == 'es':
-          self.nlp = spacy.load("es_core_news_sm")
+            # Load CREA
+            with open(crea_total_path) as f:
+                lines = f.readlines()
+                f.close()
+            crea = {}
+            for l in lines[1:1000]: # those words not in the 1000 most frequent words in CREA are low frequency words
+                data = l.strip().split()
+                crea[data[1]] = float(data[2].replace(',', ''))
+            self.lang_word_freqs = crea
        if lang == 'en':
-          self.nlp = spacy.load("en_core_web_sm")
-
-        # Para leer el texto que introducimos
-        f = open(crea_total_path)
-        lines = f.readlines()
-        f.close()
-        crea = {}
-        for l in lines[1:1000]: # those words not in the 1000 most frequent words in CREA are low frequency words
-            data = l.strip().split()
-            crea[data[1]] = float(data[2].replace(',', ''))
-        self.crea = crea
-    pass
+          self.lang_word_freqs = {}

-    def textProcessing(self, text):
+    def read(self, text):
        # Meter todas las funciones en una patron de los tokens válidos
        doc = self.nlp(text)
        self.tokens = [w for w in doc]
@@ -43,9 +43,45 @@ class TextComplexitySpacy():
            ws = self.nlp(sentence.text)
            pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])
        self.pos_content_sentences = pos_content_sentences 
-           
-        return self.pos_content_sentences

+    def get_all_metrics(self):
+        self.punctuationMarks()
+        self.lexicalComplexity()
+        self.ssReadability()
+        self.sentenceComplexity()
+        self.autoReadability()
+        self.embeddingDepth()
+        self.readability()
+        self.ageReadability()
+        self.yearsCrawford()
+
+        metrics = {
+            'npunct': self.npunctuation,        # number of punctuation marks
+            'nword': self.N_words,              # number of non-punctuation tokens (words)
+            'ILFW': self.ILFW,                  # index of low frequency words
+            'LDI': self.LDI,                    # lexical diversity index
+            'LC': self.LC,                      # lexical complexity index
+            'nrword': self.N_rw,                # number of rare words
+            'SSR': self.SSR,                    # Spaulding's readability score
+            'avgsentl': self.ASL,               # average sentences length
+            'ncompsent': self.N_cs,             # number of complex sentences (those with composed verbs)
+            'nsent': self.N_s,                  # number of sentences
+            'SCI': self.SCI,                    # sentence complexity index
+            'nchar': self.N_charac,             # number of characters
+            'ARI': self.ARI,                    # auto readability index
+            'min_depth': self.min_max_list,     # minimum of maximum tree depths
+            'max_depth': self.max_max_list,     # maximum of maximum tree depths
+            'mean_depth': self.mean_max_list,   # mean of maximum tree depths
+            'nsyllab': self.n_syllables,        # number of syllables
+            'huerta': self.huertareadability,   # Huerta's readability
+            'IFSZ': self.ifszreadability,       # Flesch-Szigriszt legibility
+            'polini': self.polinicompressibility, # Polini's comprehensibility
+            'mu': self.mureadability,           # Mu readability
+            'minage': self.minimumage,          # minimum age
+            'SOL': self.solreadability,         # SOL readability
+            'crawford': self.years              # Crawford's years
+        }
+        return metrics
    
    def punctuationMarks(self):
        # Solo nos interesa contar los tokens que sean signo de puntuación.
@@ -108,6 +144,9 @@ class TextComplexitySpacy():
        return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, self.LC

    def ssReadability(self): 
+        ''' 
+        Spaulding Score of Readability
+        '''
        #Number of rare words
        byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True)
        byfreq = byfreq[:1500]
@@ -202,7 +241,7 @@ class TextComplexitySpacy():
        
        return self.max_max_list,  self.min_max_list, self.mean_max_list

-    def syllable_counter_spanish(self,text):
+    def syllable_counter_spanish(self, text):
      if self.lang == 'es':
        t = re.sub(r'y([aáeéiíoóuú])', '\\1', text.lower())
        t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())
@@ -265,7 +304,7 @@ class TextComplexitySpacy():
        self.mureadability = mureadability
    
        return  self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen
-  
+
    def ageReadability(self):
        
        minimumage = 0.2495 *(self.N_words/self.nsentences) + 6.4763 * (self.n_syllables /self.N_words) - 7.1395