Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Alba Maria Mármol
/
TextAnalysisSpacy
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Settings
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
3c869399
authored
Feb 10, 2022
by
Alba Maria Mármol
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
Update TextAnalisisSpacy.py
parent
dbfb285b
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
65 deletions
TextAnalisisSpacy.py
TextAnalisisSpacy.py
View file @
3c869399
...
...
@@ -54,7 +54,6 @@ class TextAnalisisSpacy():
else
:
self
.
dic_categorias
[
df
.
iloc
[
i
,
0
]]
=
1
self
.
df_category
=
pd
.
DataFrame
({
'category'
:
self
.
dic_categorias
.
keys
()})
print
(
'Dictionary of categories:'
,
self
.
dic_categorias
)
# Initialising variables for graphs
...
...
@@ -88,7 +87,7 @@ class TextAnalisisSpacy():
def
export
(
self
):
print
(
'Exporting...'
)
self
.
df
.
to_csv
(
"data.csv"
)
self
.
df
_category
.
to_csv
(
"data_cat.csv"
)
self
.
df
.
groupby
(
'category'
)
.
agg
([
'mean'
,
'median'
,
'std'
])
.
to_csv
(
"data_cat.csv"
)
self
.
showGraph
(
self
.
df
.
columns
[
2
:],
'strip'
,
True
)
self
.
showGraph
(
self
.
df
.
columns
[
2
:],
'box'
,
True
)
self
.
showGraph
(
self
.
df
.
columns
[
2
:],
'heatmap'
,
True
)
...
...
@@ -103,30 +102,16 @@ class TextAnalisisSpacy():
# Volumetrics for each category
volumetry
=
[
'words'
,
'uniques'
,
'chars'
,
'avg_words_len'
]
category_columns
=
[
'category'
,
'docs'
]
for
col
in
volumetry
:
category_columns
.
append
(
'avg_'
+
col
)
category_columns
.
append
(
'std_'
+
col
)
i
=
0
groups
=
self
.
df
.
groupby
(
self
.
df
.
category
)
for
cat
in
self
.
dic_categorias
:
df_grupo
=
groups
.
get_group
(
cat
)
for
col
in
volumetry
:
self
.
df_category
.
loc
[
i
,
'docs'
]
=
len
(
df_grupo
)
self
.
df_category
.
loc
[
i
,
'avg_'
+
col
]
=
round
(
df_grupo
[
col
]
.
mean
(),
3
)
self
.
df_category
.
loc
[
i
,
'std_'
+
col
]
=
round
(
df_grupo
[
col
]
.
std
(),
5
)
i
+=
1
print
(
'Volumetrics for each text:'
)
display
(
self
.
df
.
head
())
print
(
'Volumetrics for each category:'
)
display
(
self
.
df
_category
[
category_columns
]
)
display
(
self
.
df
.
groupby
(
'category'
)
.
agg
([
'mean'
,
'median'
,
'std'
])
)
self
.
showGraph
(
volumetry
,
'strip'
)
self
.
showGraph
(
volumetry
,
'box'
)
self
.
showGraph
(
volumetry
,
'heatmap'
)
return
self
.
df
,
self
.
df_category
return
self
.
df
def
lemmas
(
self
):
# Number and length of different lemmas per text
...
...
@@ -156,32 +141,17 @@ class TextAnalisisSpacy():
self
.
dic_lemmas
=
dic_lemmas
# Average and variance of different lemmas and length by category
i
=
0
col_lemmas
=
[
'lemmas_uniques'
,
'avg_lemmas_len'
]
category_lemmas
=
[
'category'
]
for
col
in
col_lemmas
:
category_lemmas
.
append
(
'avg_'
+
col
)
category_lemmas
.
append
(
'std_'
+
col
)
groups
=
self
.
df
.
groupby
(
self
.
df
.
category
)
for
cat
in
self
.
dic_categorias
:
df_grupo
=
groups
.
get_group
(
cat
)
for
col
in
col_lemmas
:
self
.
df_category
.
loc
[
i
,
'docs'
]
=
len
(
df_grupo
)
self
.
df_category
.
loc
[
i
,
'avg_'
+
col
]
=
round
(
df_grupo
[
col
]
.
mean
(),
3
)
self
.
df_category
.
loc
[
i
,
'std_'
+
col
]
=
round
(
df_grupo
[
col
]
.
std
(),
3
)
i
+=
1
print
(
'Lemmas for each text:'
)
display
(
self
.
df
.
head
())
print
(
'Lemmas for each category:'
)
display
(
self
.
df
_category
[
category_lemmas
]
)
display
(
self
.
df
.
groupby
(
'category'
)
.
agg
([
'mean'
,
'median'
,
'std'
])
)
self
.
showGraph
(
col_lemmas
,
'strip'
)
self
.
showGraph
(
col_lemmas
,
'box'
)
self
.
showGraph
(
col_lemmas
,
'heatmap'
)
return
self
.
df
,
self
.
df_category
return
self
.
df
def
lemmas_freq
(
self
,
n
=
50
):
# Most frequent lemmas by category
...
...
@@ -236,24 +206,14 @@ class TextAnalisisSpacy():
self
.
dic_pos_cat
=
dic_pos_cat
# POS analysis for each category
i
=
0
groups
=
self
.
df
.
groupby
(
self
.
df
.
category
)
for
cat
in
self
.
dic_categorias
:
df_grupo
=
groups
.
get_group
(
cat
)
for
pos
in
self
.
POS_LIST
:
if
pos
in
df_grupo
.
columns
.
values
:
self
.
df_category
.
loc
[
i
,
'avg_'
+
pos
]
=
round
(
df_grupo
[
pos
]
.
mean
(),
3
)
self
.
df_category
.
loc
[
i
,
'std_'
+
pos
]
=
round
(
df_grupo
[
pos
]
.
std
(),
3
)
i
+=
1
print
(
'POS analysis for each text'
)
display
(
self
.
df
.
head
())
print
(
'POS analysis for each category'
)
display
(
self
.
df
_category
)
display
(
self
.
df
.
groupby
(
'category'
)
.
agg
([
'mean'
,
'median'
,
'std'
])
)
self
.
showGraph
(
self
.
POS_LIST
,
'strip'
)
self
.
showGraph
(
self
.
POS_LIST
,
'box'
)
self
.
showGraph
(
self
.
POS_LIST
,
'heatmap'
)
return
self
.
df
,
self
.
df_category
return
self
.
df
def
pos_freq
(
self
,
n
=
15
):
# Most frequent words
...
...
@@ -297,21 +257,15 @@ class TextAnalisisSpacy():
# Lexical diversity for each category
i
=
0
col_diversity
=
[
'simple_TTR'
,
'root_TTR'
,
'log_TTR'
,
'maas_TTR'
,
'MSTTR'
,
'MATTR'
,
'HDD'
,
'MTLD'
]
groups
=
self
.
df
.
groupby
(
self
.
df
.
category
)
for
cat
in
self
.
dic_categorias
:
df_grupo
=
groups
.
get_group
(
cat
)
for
col
in
col_diversity
:
self
.
df_category
.
loc
[
i
,
'avg_'
+
col
]
=
round
(
df_grupo
[
col
]
.
mean
(),
4
)
self
.
df_category
.
loc
[
i
,
'std_'
+
col
]
=
round
(
df_grupo
[
col
]
.
std
(),
4
)
i
+=
1
print
(
'Lexical diversity for each text'
)
display
(
self
.
df
.
head
())
print
(
'Lexical diversity for each category'
)
display
(
self
.
df
_category
)
display
(
self
.
df
.
groupby
(
'category'
)
.
agg
([
'mean'
,
'median'
,
'std'
])
)
self
.
showGraph
(
col_diversity
,
'strip'
)
self
.
showGraph
(
col_diversity
,
'box'
)
self
.
showGraph
(
col_diversity
,
'heatmap'
)
return
self
.
df
,
self
.
df_category
return
self
.
df
def
complexity
(
self
):
# Complexity diversity for each category
...
...
@@ -342,22 +296,15 @@ class TextAnalisisSpacy():
col_complexity
=
[
'lexcomplexity'
,
'ssreadability'
,
'sencomplexity'
,
'autoreadability'
,
'max_embeddingdepth'
,
'min_embeddingdepth'
,
'avg_embeddingdepth'
,
'huertareadability'
,
'ifszreadability'
,
'polinicompressibility'
,
'mureadability'
,
'agereadability'
,
'yearscrawford'
]
groups
=
self
.
df
.
groupby
(
self
.
df
.
category
)
for
cat
in
self
.
dic_categorias
:
df_grupo
=
groups
.
get_group
(
cat
)
for
col
in
col_complexity
:
self
.
df_category
.
loc
[
i
,
'avg_'
+
col
]
=
round
(
df_grupo
[
col
]
.
mean
(),
4
)
self
.
df_category
.
loc
[
i
,
'std_'
+
col
]
=
round
(
df_grupo
[
col
]
.
std
(),
4
)
i
+=
1
print
(
'Complexity diversity for each text'
)
display
(
self
.
df
.
head
())
print
(
'Complexity diversity for each category'
)
display
(
self
.
df
_category
)
display
(
self
.
df
.
groupby
(
'category'
)
.
agg
([
'mean'
,
'median'
,
'std'
])
)
self
.
showGraph
(
col_complexity
,
'strip'
)
self
.
showGraph
(
col_complexity
,
'box'
)
self
.
showGraph
(
col_complexity
,
'heatmap'
)
return
self
.
df
,
self
.
df_category
return
self
.
df
def
featureSelection
(
self
):
df
=
self
.
df
.
fillna
(
0
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment