Update TextAnalisisSpacy.py

parent dbfb285b
Showing with 12 additions and 65 deletions
......@@ -54,7 +54,6 @@ class TextAnalisisSpacy():
else:
self.dic_categorias[df.iloc[i,0]] = 1
self.df_category = pd.DataFrame({'category': self.dic_categorias.keys()})
print('Dictionary of categories:', self.dic_categorias)
# Initialising variables for graphs
......@@ -88,7 +87,7 @@ class TextAnalisisSpacy():
def export(self):
print('Exporting...')
self.df.to_csv("data.csv")
self.df_category.to_csv("data_cat.csv")
self.df.groupby('category').agg(['mean', 'median','std']).to_csv("data_cat.csv")
self.showGraph(self.df.columns[2:],'strip',True)
self.showGraph(self.df.columns[2:],'box',True)
self.showGraph(self.df.columns[2:],'heatmap',True)
......@@ -103,30 +102,16 @@ class TextAnalisisSpacy():
# Volumetrics for each category
volumetry = ['words','uniques','chars','avg_words_len']
category_columns = ['category','docs']
for col in volumetry:
category_columns.append('avg_'+col)
category_columns.append('std_'+col)
i = 0
groups = self.df.groupby(self.df.category)
for cat in self.dic_categorias:
df_grupo = groups.get_group(cat)
for col in volumetry:
self.df_category.loc[i,'docs'] = len(df_grupo)
self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3)
self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 5)
i+=1
print('Volumetrics for each text:')
display(self.df.head())
print('Volumetrics for each category:')
display(self.df_category[category_columns])
display(self.df.groupby('category').agg(['mean', 'median','std']))
self.showGraph(volumetry,'strip')
self.showGraph(volumetry,'box')
self.showGraph(volumetry,'heatmap')
return self.df, self.df_category
return self.df
def lemmas(self):
# Number and length of different lemmas per text
......@@ -156,32 +141,17 @@ class TextAnalisisSpacy():
self.dic_lemmas = dic_lemmas
# Average and variance of different lemmas and length by category
i = 0
col_lemmas = ['lemmas_uniques','avg_lemmas_len']
category_lemmas = ['category']
for col in col_lemmas:
category_lemmas.append('avg_'+col)
category_lemmas.append('std_'+col)
groups = self.df.groupby(self.df.category)
for cat in self.dic_categorias:
df_grupo = groups.get_group(cat)
for col in col_lemmas:
self.df_category.loc[i,'docs'] = len(df_grupo)
self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3)
self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 3)
i+=1
print('Lemmas for each text:')
display(self.df.head())
print('Lemmas for each category:')
display(self.df_category[category_lemmas])
display(self.df.groupby('category').agg(['mean', 'median','std']))
self.showGraph(col_lemmas,'strip')
self.showGraph(col_lemmas,'box')
self.showGraph(col_lemmas,'heatmap')
return self.df, self.df_category
return self.df
def lemmas_freq(self, n = 50):
# Most frequent lemmas by category
......@@ -236,24 +206,14 @@ class TextAnalisisSpacy():
self.dic_pos_cat = dic_pos_cat
# POS analysis for each category
i = 0
groups = self.df.groupby(self.df.category)
for cat in self.dic_categorias:
df_grupo = groups.get_group(cat)
for pos in self.POS_LIST:
if pos in df_grupo.columns.values:
self.df_category.loc[i,'avg_'+pos] = round(df_grupo[pos].mean(), 3)
self.df_category.loc[i,'std_'+pos] = round(df_grupo[pos].std(), 3)
i+=1
print('POS analysis for each text')
display(self.df.head())
print('POS analysis for each category')
display(self.df_category)
display(self.df.groupby('category').agg(['mean', 'median','std']))
self.showGraph(self.POS_LIST,'strip')
self.showGraph(self.POS_LIST,'box')
self.showGraph(self.POS_LIST,'heatmap')
return self.df, self.df_category
return self.df
def pos_freq(self, n = 15):
# Most frequent words
......@@ -297,21 +257,15 @@ class TextAnalisisSpacy():
# Lexical diversity for each category
i = 0
col_diversity = ['simple_TTR','root_TTR','log_TTR','maas_TTR','MSTTR','MATTR','HDD','MTLD']
groups = self.df.groupby(self.df.category)
for cat in self.dic_categorias:
df_grupo = groups.get_group(cat)
for col in col_diversity:
self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(),4)
self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(),4)
i+=1
print('Lexical diversity for each text')
display(self.df.head())
print('Lexical diversity for each category')
display(self.df_category)
display(self.df.groupby('category').agg(['mean', 'median','std']))
self.showGraph(col_diversity,'strip')
self.showGraph(col_diversity,'box')
self.showGraph(col_diversity,'heatmap')
return self.df, self.df_category
return self.df
def complexity(self):
# Complexity diversity for each category
......@@ -342,22 +296,15 @@ class TextAnalisisSpacy():
col_complexity = ['lexcomplexity','ssreadability','sencomplexity','autoreadability','max_embeddingdepth',
'min_embeddingdepth','avg_embeddingdepth','huertareadability','ifszreadability',
'polinicompressibility','mureadability','agereadability','yearscrawford']
groups = self.df.groupby(self.df.category)
for cat in self.dic_categorias:
df_grupo = groups.get_group(cat)
for col in col_complexity:
self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 4)
self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 4)
i+=1
print('Complexity diversity for each text')
display(self.df.head())
print('Complexity diversity for each category')
display(self.df_category)
display(self.df.groupby('category').agg(['mean', 'median','std']))
self.showGraph(col_complexity,'strip')
self.showGraph(col_complexity,'box')
self.showGraph(col_complexity,'heatmap')
return self.df, self.df_category
return self.df
def featureSelection(self):
df = self.df.fillna(0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment