Commit dbfb285b by Alba Maria Mármol

Merge remote-tracking branch 'origin/master'

parents d4ec88ef 0e30ea56
Showing with 410 additions and 1 deletion
# Only required for analysis in Spanish
import spacy.cli
spacy.cli.download("es_core_news_sm")
import es_core_news_sm
# Imports
import spacy
import numpy as np
from tqdm import tqdm
import re
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
import nltk
from nltk.probability import FreqDist
from nltk.text import Text
from lexical_diversity import lex_div as ld
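# For English analysis (lang='en') the small English model must be available
# as well; if it is missing it can be downloaded the same way:
# spacy.cli.download("en_core_web_sm")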
class TextAnalisisSpacy():
    def __init__(self, lang='es'):
        # Create the language analyzer.
        # TextComplexitySpacy comes from the second file changed in this commit.
        if lang == 'es':
            self.nlp = es_core_news_sm.load()
            self.textComplexitySpacy = TextComplexitySpacy()
        elif lang == 'en':
            self.nlp = spacy.load("en_core_web_sm")
            self.textComplexitySpacy = TextComplexitySpacy('en')
        self.Text = Text
        self.FreqDist = FreqDist
        self.POS_LIST = ["ADJ", "ADP", "ADV", "AUX", "X", "CCONJ", "CONJ", "DET", "INTJ", "NOUN",
                         "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "SPACE"]
    #
    # X = sample texts, y = category tags
    #
    def textProcessing(self, X, y):
        d = {'category': y, 'text': X}
        self.df = pd.DataFrame(d)
        # Fill gaps left by missing texts
        self.df['text'] = self.df['text'].replace(np.nan, '')
        print('Shape: ', self.df.shape)
        # Create category dictionary (category -> number of documents)
        self.dic_categorias = {}
        for i in range(len(self.df)):
            if self.df.iloc[i, 0] in self.dic_categorias:
                self.dic_categorias[self.df.iloc[i, 0]] += 1
            else:
                self.dic_categorias[self.df.iloc[i, 0]] = 1
        self.df_category = pd.DataFrame({'category': list(self.dic_categorias.keys())})
        print('Dictionary of categories:', self.dic_categorias)
        # Initialise variables for graphs
        sb.set(rc={'figure.figsize': (14, 6)})
        all_values = self.dic_categorias.values()
        self.max_value = max(all_values)
    def showGraph(self, columnas, type_g='strip', export=False):
        # Graph generator
        for columna in columnas:
            if type_g == 'strip':
                splot = sb.stripplot(x=columna, y='category', data=self.df)
            elif type_g == 'box':
                splot = sb.boxplot(x=columna, y='category', data=self.df)
            elif type_g == 'heatmap':
                dic = {}
                groups = self.df.groupby(self.df.category)
                for cat in self.dic_categorias:
                    df_grupo = groups.get_group(cat)
                    dic[cat] = df_grupo[columna].tolist()
                    # Pad shorter categories by repeating the last value,
                    # so every heatmap row has the same length
                    while len(dic[cat]) < self.max_value:
                        dic[cat].append(dic[cat][-1])
                df_n = pd.DataFrame(dic)
                splot = sb.heatmap(df_n.transpose()).set_title(columna)
            if not export:
                plt.show()
            else:
                splot.get_figure().savefig(columna + "-" + type_g + ".jpg", bbox_inches='tight')
            plt.clf()
    def export(self):
        print('Exporting...')
        self.df.to_csv("data.csv")
        self.df_category.to_csv("data_cat.csv")
        self.showGraph(self.df.columns[2:], 'strip', True)
        self.showGraph(self.df.columns[2:], 'box', True)
        self.showGraph(self.df.columns[2:], 'heatmap', True)
    def volumetry(self):
        # Volumetrics for each text
        self.df['words'] = [len(text.split()) for text in self.df['text'].tolist()]         # number of words
        self.df['uniques'] = [len(set(text.split())) for text in self.df['text'].tolist()]  # number of unique words
        self.df['chars'] = self.df['text'].str.len()                                        # number of characters
        self.df['avg_words_len'] = round(self.df['chars'] / self.df['words'], 3)            # average word length
        self.df = self.df.replace([np.inf, -np.inf, np.nan], 0)
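        # Illustration (toy example, not from the data): for the text
        # "the cat sat on the mat" -> words=6, uniques=5, chars=22,
        # avg_words_len=round(22/6, 3)=3.667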
        # Volumetrics for each category
        volumetry = ['words', 'uniques', 'chars', 'avg_words_len']
        category_columns = ['category', 'docs']
        for col in volumetry:
            category_columns.append('avg_' + col)
            category_columns.append('std_' + col)
        i = 0
        groups = self.df.groupby(self.df.category)
        for cat in self.dic_categorias:
            df_grupo = groups.get_group(cat)
            self.df_category.loc[i, 'docs'] = len(df_grupo)
            for col in volumetry:
                self.df_category.loc[i, 'avg_' + col] = round(df_grupo[col].mean(), 3)
                self.df_category.loc[i, 'std_' + col] = round(df_grupo[col].std(), 5)
            i += 1
        print('Volumetrics for each text:')
        display(self.df.head())
        print('Volumetrics for each category:')
        display(self.df_category[category_columns])
        self.showGraph(volumetry, 'strip')
        self.showGraph(volumetry, 'box')
        self.showGraph(volumetry, 'heatmap')
        return self.df, self.df_category
    def lemmas(self):
        # Number and length of different lemmas per text
        dic_lemmas = {}
        for cat in self.dic_categorias:
            dic_lemmas[cat] = []
        groups = self.df.groupby(self.df.category)
        for cat in tqdm(self.dic_categorias):
            df_grupo = groups.get_group(cat)
            # Use the group's own row labels so results land on the right rows
            for idx, text in zip(df_grupo.index, df_grupo['text'].tolist()):
                set_ = set()
                suma = 0
                doc = self.nlp(text)
                for token in doc:
                    set_.add(token.lemma_)
                    suma += len(token.lemma_)
                    if re.match('PUNCT.*|SYM.*|SPACE.*', token.pos_) is None:
                        dic_lemmas[cat].append(token.lemma_)
                self.df.loc[idx, 'lemmas_uniques'] = len(set_)
                if len(set_) != 0:
                    self.df.loc[idx, 'avg_lemmas_len'] = round(suma / len(set_), 3)
                else:
                    self.df.loc[idx, 'avg_lemmas_len'] = suma
        self.dic_lemmas = dic_lemmas
        # Average and standard deviation of different lemmas and length by category
        i = 0
        col_lemmas = ['lemmas_uniques', 'avg_lemmas_len']
        category_lemmas = ['category']
        for col in col_lemmas:
            category_lemmas.append('avg_' + col)
            category_lemmas.append('std_' + col)
        groups = self.df.groupby(self.df.category)  # regroup: new columns were added
        for cat in self.dic_categorias:
            df_grupo = groups.get_group(cat)
            self.df_category.loc[i, 'docs'] = len(df_grupo)
            for col in col_lemmas:
                self.df_category.loc[i, 'avg_' + col] = round(df_grupo[col].mean(), 3)
                self.df_category.loc[i, 'std_' + col] = round(df_grupo[col].std(), 3)
            i += 1
        print('Lemmas for each text:')
        display(self.df.head())
        print('Lemmas for each category:')
        display(self.df_category[category_lemmas])
        self.showGraph(col_lemmas, 'strip')
        self.showGraph(col_lemmas, 'box')
        self.showGraph(col_lemmas, 'heatmap')
        return self.df, self.df_category
    def lemmas_freq(self, n=50):
        # Most frequent lemmas by category (relative frequencies)
        dic_f_lemmas = self.dic_categorias.copy()
        for cat in self.dic_categorias:
            text = self.Text(self.dic_lemmas[cat])
            dic_f_lemmas[cat] = self.FreqDist(text).most_common(n)
            lista = []
            for tupla in dic_f_lemmas[cat]:
                lista.append((tupla[0], round(tupla[1] / len(self.dic_lemmas[cat]), 4)))
            while len(lista) < n:  # Fill gaps so every column has length n
                lista.append(np.nan)
            dic_f_lemmas[cat] = lista
        df_freq_lemas = pd.DataFrame(dic_f_lemmas)
        df_freq_lemas_tr = df_freq_lemas.transpose()
        print('Most frequent lemmas by category')
        display(df_freq_lemas_tr)
        df_freq_lemas_tr.to_csv("lemas_freq.csv")
        return df_freq_lemas_tr
    def pos(self):
        # POS analysis for each text
        dic_pos_cat = {}
        for pos in self.POS_LIST:
            dic_pos_cat[pos] = {}
            for cat in self.dic_categorias:
                dic_pos_cat[pos][cat] = []
        groups = self.df.groupby(self.df.category)
        for cat in self.dic_categorias:
            df_grupo = groups.get_group(cat)
            # Use the group's own row labels so results land on the right rows
            for idx, text in tqdm(zip(df_grupo.index, df_grupo['text'].tolist())):
                dic_pos = {}
                doc = self.nlp(text)
                for token in doc:
                    if token.pos_ in dic_pos:
                        dic_pos[token.pos_] += 1
                    else:
                        dic_pos[token.pos_] = 1
                    dic_pos_cat[token.pos_][cat].append(token.text)
                total = len(doc)
                if total == 0:
                    total = 1
                for pos in self.POS_LIST:
                    if pos in dic_pos:
                        self.df.loc[idx, pos] = round(dic_pos[pos] / total, 4)
                    else:
                        self.df.loc[idx, pos] = np.nan
        self.dic_pos_cat = dic_pos_cat
        # POS analysis for each category
        i = 0
        groups = self.df.groupby(self.df.category)
        for cat in self.dic_categorias:
            df_grupo = groups.get_group(cat)
            for pos in self.POS_LIST:
                if pos in df_grupo.columns.values:
                    self.df_category.loc[i, 'avg_' + pos] = round(df_grupo[pos].mean(), 3)
                    self.df_category.loc[i, 'std_' + pos] = round(df_grupo[pos].std(), 3)
            i += 1
        print('POS analysis for each text')
        display(self.df.head())
        print('POS analysis for each category')
        display(self.df_category)
        self.showGraph(self.POS_LIST, 'strip')
        self.showGraph(self.POS_LIST, 'box')
        self.showGraph(self.POS_LIST, 'heatmap')
        return self.df, self.df_category
    def pos_freq(self, n=15):
        # Most frequent words per POS tag and category
        for pos in self.POS_LIST:
            dic_f_palabras = self.dic_categorias.copy()
            for cat in self.dic_categorias:
                if cat in self.dic_pos_cat[pos]:
                    text = self.Text(self.dic_pos_cat[pos][cat])
                    fdist = self.FreqDist(text)
                    dic_f_palabras[cat] = fdist.most_common(n)
                    lista = []
                    for tupla in dic_f_palabras[cat]:
                        lista.append((tupla[0], round(tupla[1] / len(self.dic_pos_cat[pos][cat]), 5)))
                    while len(lista) < n:  # Fill gaps so every column has length n
                        lista.append(np.nan)
                    dic_f_palabras[cat] = lista
            df_freq_palabras = pd.DataFrame(dic_f_palabras)
            print("---- For " + str(spacy.explain(pos)) + " the " + str(n) + " most frequent words are: -------")
            df_freq_palabras_tr = df_freq_palabras.transpose()
            display(df_freq_palabras_tr)
            df_freq_palabras_tr.to_csv("POS_" + str(pos) + "_freq.csv")
        return df_freq_palabras_tr
    def lexical_diversity(self):
        # Lexical diversity for each text
        i = 0
        for text in tqdm(self.df['text'].tolist()):
            flt = ld.flemmatize(text)
            self.df.loc[i, 'simple_TTR'] = round(ld.ttr(flt), 4)
            self.df.loc[i, 'root_TTR'] = round(ld.root_ttr(flt), 4)
            self.df.loc[i, 'log_TTR'] = round(ld.log_ttr(flt), 4)
            self.df.loc[i, 'maas_TTR'] = round(ld.maas_ttr(flt), 4)
            self.df.loc[i, 'MSTTR'] = round(ld.msttr(flt), 4)
            self.df.loc[i, 'MATTR'] = round(ld.mattr(flt), 4)
            self.df.loc[i, 'HDD'] = round(ld.hdd(flt), 4)
            self.df.loc[i, 'MTLD'] = round(ld.mtld(flt), 4)
            i += 1
        # Lexical diversity for each category
        i = 0
        col_diversity = ['simple_TTR', 'root_TTR', 'log_TTR', 'maas_TTR', 'MSTTR', 'MATTR', 'HDD', 'MTLD']
        groups = self.df.groupby(self.df.category)
        for cat in self.dic_categorias:
            df_grupo = groups.get_group(cat)
            for col in col_diversity:
                self.df_category.loc[i, 'avg_' + col] = round(df_grupo[col].mean(), 4)
                self.df_category.loc[i, 'std_' + col] = round(df_grupo[col].std(), 4)
            i += 1
        print('Lexical diversity for each text')
        display(self.df.head())
        print('Lexical diversity for each category')
        display(self.df_category)
        self.showGraph(col_diversity, 'strip')
        self.showGraph(col_diversity, 'box')
        self.showGraph(col_diversity, 'heatmap')
        return self.df, self.df_category
    def complexity(self):
        # Complexity measures for each text
        i = 0
        for text in tqdm(self.df['text'].tolist()):
            if len(text) > 0:
                # Prepare the analyzer for this text (return values are not used here)
                self.textComplexitySpacy.textProcessing(text)
                self.textComplexitySpacy.punctuationMarks()
                self.df.loc[i, 'lexcomplexity'] = self.textComplexitySpacy.lexicalComplexity()[6]
                self.df.loc[i, 'ssreadability'] = self.textComplexitySpacy.ssReadability()[1]
                self.df.loc[i, 'sencomplexity'] = self.textComplexitySpacy.sentenceComplexity()[3]
                self.df.loc[i, 'autoreadability'] = self.textComplexitySpacy.autoReadability()[1]
                embeddingdepth = self.textComplexitySpacy.embeddingDepth()
                self.df.loc[i, 'max_embeddingdepth'] = embeddingdepth[0]
                self.df.loc[i, 'min_embeddingdepth'] = embeddingdepth[1]
                self.df.loc[i, 'avg_embeddingdepth'] = embeddingdepth[2]
                readability = self.textComplexitySpacy.readability()
                self.df.loc[i, 'huertareadability'] = round(readability[3], 4)
                self.df.loc[i, 'ifszreadability'] = round(readability[4], 4)
                self.df.loc[i, 'polinicompressibility'] = round(readability[5], 4)
                self.df.loc[i, 'mureadability'] = round(readability[6], 4)
                self.df.loc[i, 'agereadability'] = self.textComplexitySpacy.ageReadability()[0]
                self.df.loc[i, 'yearscrawford'] = self.textComplexitySpacy.yearsCrawford()
            i += 1
        # Complexity measures for each category
        i = 0
        col_complexity = ['lexcomplexity', 'ssreadability', 'sencomplexity', 'autoreadability',
                          'max_embeddingdepth', 'min_embeddingdepth', 'avg_embeddingdepth',
                          'huertareadability', 'ifszreadability', 'polinicompressibility',
                          'mureadability', 'agereadability', 'yearscrawford']
        groups = self.df.groupby(self.df.category)
        for cat in self.dic_categorias:
            df_grupo = groups.get_group(cat)
            for col in col_complexity:
                self.df_category.loc[i, 'avg_' + col] = round(df_grupo[col].mean(), 4)
                self.df_category.loc[i, 'std_' + col] = round(df_grupo[col].std(), 4)
            i += 1
        print('Complexity measures for each text')
        display(self.df.head())
        print('Complexity measures for each category')
        display(self.df_category)
        self.showGraph(col_complexity, 'strip')
        self.showGraph(col_complexity, 'box')
        self.showGraph(col_complexity, 'heatmap')
        return self.df, self.df_category
    def featureSelection(self):
        df = self.df.fillna(0)
        X = df.iloc[:, 2:]
        y = df.iloc[:, 0]
        from sklearn.feature_selection import VarianceThreshold, SelectFromModel
        # Removing features with low variance: drop features that take the same
        # value in more than 80% of the samples (Bernoulli variance p*(1-p), p=0.8)
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        arr = sel.fit_transform(X)
        print('Removing features with low variance...')
        print('Selected columns:', sel.get_feature_names_out(self.df.columns.values[2:]))
        display(pd.DataFrame(arr))
        pd.DataFrame(arr).to_csv("VarianceThreshold.csv")
        # SelectFromModel: feature selection based on L1 regularisation
        from sklearn.svm import LinearSVC
        lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
        model = SelectFromModel(lsvc, prefit=True)
        X_new = model.transform(X)
        print('Removing features with SelectFromModel...')
        print('Selected columns:', model.get_feature_names_out(df.columns.values[2:]))
        display(pd.DataFrame(X_new))
        pd.DataFrame(X_new).to_csv("SelectFromModel.csv")
    def kBest(self, k=10):
        df = self.df.fillna(0)
        X = df.iloc[:, 2:]
        y = df.iloc[:, 0]
        # Univariate feature selection
        from sklearn.feature_selection import SelectKBest
        from sklearn.feature_selection import f_classif, mutual_info_classif
        print('Highest scoring ' + str(k) + ' features with f_classif...')
        kbest_classif = SelectKBest(f_classif, k=k)  # keep only the k highest-scoring features
        X_classif = kbest_classif.fit_transform(X, y)
        print('Selected columns:', kbest_classif.get_feature_names_out(self.df.columns.values[2:]))
        display(pd.DataFrame(X_classif))
        pd.DataFrame(X_classif).to_csv("f_classif.csv")
        print('Highest scoring ' + str(k) + ' features with mutual_info_classif...')
        kbest_mut = SelectKBest(mutual_info_classif, k=k)
        X_mut = kbest_mut.fit_transform(X, y)
        print('Selected columns:', kbest_mut.get_feature_names_out(self.df.columns.values[2:]))
        display(pd.DataFrame(X_mut))
        pd.DataFrame(X_mut).to_csv("mutual_info_classif.csv")
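# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the commit): the texts
# and category labels below are made up, and the call order follows the
# dependencies in the class (lemmas() must run before lemmas_freq(), and
# pos() before pos_freq()). complexity() additionally assumes the
# TextComplexitySpacy resources (e.g. CREA_total.txt) are available.
texts = ['El gato duerme en la cama.', 'La red neuronal aprende con datos.']
labels = ['hogar', 'tecnologia']
analyzer = TextAnalisisSpacy(lang='es')
analyzer.textProcessing(texts, labels)
df, df_category = analyzer.volumetry()
analyzer.lemmas()
analyzer.lemmas_freq(n=10)
analyzer.pos()
analyzer.pos_freq(n=5)
analyzer.lexical_diversity()
analyzer.complexity()
analyzer.export()
# With a realistically sized corpus one could then also run:
# analyzer.featureSelection()
# analyzer.kBest(k=5)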
@@ -2,7 +2,7 @@ from functools import reduce
 import math
 import syllables
-crea_total_path = '../CREA_total.txt'
+crea_total_path = './CREA_total.txt'
 class TextComplexitySpacy():
 ...