Update TextAnalisisSpacy.py

parent dbfb285b
Showing with 12 additions and 65 deletions
...@@ -54,7 +54,6 @@ class TextAnalisisSpacy(): ...@@ -54,7 +54,6 @@ class TextAnalisisSpacy():
else: else:
self.dic_categorias[df.iloc[i,0]] = 1 self.dic_categorias[df.iloc[i,0]] = 1
self.df_category = pd.DataFrame({'category': self.dic_categorias.keys()})
print('Dictionary of categories:', self.dic_categorias) print('Dictionary of categories:', self.dic_categorias)
# Initialising variables for graphs # Initialising variables for graphs
...@@ -88,7 +87,7 @@ class TextAnalisisSpacy(): ...@@ -88,7 +87,7 @@ class TextAnalisisSpacy():
def export(self): def export(self):
print('Exporting...') print('Exporting...')
self.df.to_csv("data.csv") self.df.to_csv("data.csv")
self.df_category.to_csv("data_cat.csv") self.df.groupby('category').agg(['mean', 'median','std']).to_csv("data_cat.csv")
self.showGraph(self.df.columns[2:],'strip',True) self.showGraph(self.df.columns[2:],'strip',True)
self.showGraph(self.df.columns[2:],'box',True) self.showGraph(self.df.columns[2:],'box',True)
self.showGraph(self.df.columns[2:],'heatmap',True) self.showGraph(self.df.columns[2:],'heatmap',True)
...@@ -103,30 +102,16 @@ class TextAnalisisSpacy(): ...@@ -103,30 +102,16 @@ class TextAnalisisSpacy():
# Volumetrics for each category # Volumetrics for each category
volumetry = ['words','uniques','chars','avg_words_len'] volumetry = ['words','uniques','chars','avg_words_len']
category_columns = ['category','docs']
for col in volumetry:
category_columns.append('avg_'+col)
category_columns.append('std_'+col)
i = 0
groups = self.df.groupby(self.df.category)
for cat in self.dic_categorias:
df_grupo = groups.get_group(cat)
for col in volumetry:
self.df_category.loc[i,'docs'] = len(df_grupo)
self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3)
self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 5)
i+=1
print('Volumetrics for each text:') print('Volumetrics for each text:')
display(self.df.head()) display(self.df.head())
print('Volumetrics for each category:') print('Volumetrics for each category:')
display(self.df_category[category_columns]) display(self.df.groupby('category').agg(['mean', 'median','std']))
self.showGraph(volumetry,'strip') self.showGraph(volumetry,'strip')
self.showGraph(volumetry,'box') self.showGraph(volumetry,'box')
self.showGraph(volumetry,'heatmap') self.showGraph(volumetry,'heatmap')
return self.df, self.df_category return self.df
def lemmas(self): def lemmas(self):
# Number and length of different lemmas per text # Number and length of different lemmas per text
...@@ -156,32 +141,17 @@ class TextAnalisisSpacy(): ...@@ -156,32 +141,17 @@ class TextAnalisisSpacy():
self.dic_lemmas = dic_lemmas self.dic_lemmas = dic_lemmas
# Average and variance of different lemmas and length by category # Average and variance of different lemmas and length by category
i = 0
col_lemmas = ['lemmas_uniques','avg_lemmas_len'] col_lemmas = ['lemmas_uniques','avg_lemmas_len']
category_lemmas = ['category']
for col in col_lemmas:
category_lemmas.append('avg_'+col)
category_lemmas.append('std_'+col)
groups = self.df.groupby(self.df.category)
for cat in self.dic_categorias:
df_grupo = groups.get_group(cat)
for col in col_lemmas:
self.df_category.loc[i,'docs'] = len(df_grupo)
self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3)
self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 3)
i+=1
print('Lemmas for each text:') print('Lemmas for each text:')
display(self.df.head()) display(self.df.head())
print('Lemmas for each category:') print('Lemmas for each category:')
display(self.df_category[category_lemmas]) display(self.df.groupby('category').agg(['mean', 'median','std']))
self.showGraph(col_lemmas,'strip') self.showGraph(col_lemmas,'strip')
self.showGraph(col_lemmas,'box') self.showGraph(col_lemmas,'box')
self.showGraph(col_lemmas,'heatmap') self.showGraph(col_lemmas,'heatmap')
return self.df, self.df_category return self.df
def lemmas_freq(self, n = 50): def lemmas_freq(self, n = 50):
# Most frequent lemmas by category # Most frequent lemmas by category
...@@ -236,24 +206,14 @@ class TextAnalisisSpacy(): ...@@ -236,24 +206,14 @@ class TextAnalisisSpacy():
self.dic_pos_cat = dic_pos_cat self.dic_pos_cat = dic_pos_cat
# POS analysis for each category # POS analysis for each category
i = 0
groups = self.df.groupby(self.df.category)
for cat in self.dic_categorias:
df_grupo = groups.get_group(cat)
for pos in self.POS_LIST:
if pos in df_grupo.columns.values:
self.df_category.loc[i,'avg_'+pos] = round(df_grupo[pos].mean(), 3)
self.df_category.loc[i,'std_'+pos] = round(df_grupo[pos].std(), 3)
i+=1
print('POS analysis for each text') print('POS analysis for each text')
display(self.df.head()) display(self.df.head())
print('POS analysis for each category') print('POS analysis for each category')
display(self.df_category) display(self.df.groupby('category').agg(['mean', 'median','std']))
self.showGraph(self.POS_LIST,'strip') self.showGraph(self.POS_LIST,'strip')
self.showGraph(self.POS_LIST,'box') self.showGraph(self.POS_LIST,'box')
self.showGraph(self.POS_LIST,'heatmap') self.showGraph(self.POS_LIST,'heatmap')
return self.df, self.df_category return self.df
def pos_freq(self, n = 15): def pos_freq(self, n = 15):
# Most frequent words # Most frequent words
...@@ -297,21 +257,15 @@ class TextAnalisisSpacy(): ...@@ -297,21 +257,15 @@ class TextAnalisisSpacy():
# Lexical diversity for each category # Lexical diversity for each category
i = 0 i = 0
col_diversity = ['simple_TTR','root_TTR','log_TTR','maas_TTR','MSTTR','MATTR','HDD','MTLD'] col_diversity = ['simple_TTR','root_TTR','log_TTR','maas_TTR','MSTTR','MATTR','HDD','MTLD']
groups = self.df.groupby(self.df.category)
for cat in self.dic_categorias:
df_grupo = groups.get_group(cat)
for col in col_diversity:
self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(),4)
self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(),4)
i+=1
print('Lexical diversity for each text') print('Lexical diversity for each text')
display(self.df.head()) display(self.df.head())
print('Lexical diversity for each category') print('Lexical diversity for each category')
display(self.df_category) display(self.df.groupby('category').agg(['mean', 'median','std']))
self.showGraph(col_diversity,'strip') self.showGraph(col_diversity,'strip')
self.showGraph(col_diversity,'box') self.showGraph(col_diversity,'box')
self.showGraph(col_diversity,'heatmap') self.showGraph(col_diversity,'heatmap')
return self.df, self.df_category return self.df
def complexity(self): def complexity(self):
# Complexity diversity for each category # Complexity diversity for each category
...@@ -342,22 +296,15 @@ class TextAnalisisSpacy(): ...@@ -342,22 +296,15 @@ class TextAnalisisSpacy():
col_complexity = ['lexcomplexity','ssreadability','sencomplexity','autoreadability','max_embeddingdepth', col_complexity = ['lexcomplexity','ssreadability','sencomplexity','autoreadability','max_embeddingdepth',
'min_embeddingdepth','avg_embeddingdepth','huertareadability','ifszreadability', 'min_embeddingdepth','avg_embeddingdepth','huertareadability','ifszreadability',
'polinicompressibility','mureadability','agereadability','yearscrawford'] 'polinicompressibility','mureadability','agereadability','yearscrawford']
groups = self.df.groupby(self.df.category)
for cat in self.dic_categorias:
df_grupo = groups.get_group(cat)
for col in col_complexity:
self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 4)
self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 4)
i+=1
print('Complexity diversity for each text') print('Complexity diversity for each text')
display(self.df.head()) display(self.df.head())
print('Complexity diversity for each category') print('Complexity diversity for each category')
display(self.df_category) display(self.df.groupby('category').agg(['mean', 'median','std']))
self.showGraph(col_complexity,'strip') self.showGraph(col_complexity,'strip')
self.showGraph(col_complexity,'box') self.showGraph(col_complexity,'box')
self.showGraph(col_complexity,'heatmap') self.showGraph(col_complexity,'heatmap')
return self.df, self.df_category return self.df
def featureSelection(self): def featureSelection(self):
df = self.df.fillna(0) df = self.df.fillna(0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment