Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
SINAI
/
texty
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
e27fddf2
authored
Mar 05, 2022
by
Arturo Montejo Ráez
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
complexity analyzer refactored
parent
6330f677
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
66 additions
and
28 deletions
example.ipynb → examples/example.ipynb
texts.csv → examples/texts.csv
CREA_total.txt → src/texty/CREA_total.txt
__init__.py → src/texty/__init__.py
TextAnalysisSpacy.py → src/texty/analyzer.py
TextComplexitySpacy.py → src/texty/complexity.py
example.ipynb
→
example
s/example
.ipynb
View file @
e27fddf2
File moved
texts.csv
→
examples/
texts.csv
View file @
e27fddf2
File moved
CREA_total.txt
→
src/texty/
CREA_total.txt
View file @
e27fddf2
The file could not be displayed because it is too large.
__init__.py
→
src/texty/
__init__.py
View file @
e27fddf2
File moved
TextAnalysisSpacy
.py
→
src/texty/analyzer
.py
View file @
e27fddf2
...
@@ -7,7 +7,7 @@ import numpy as np
...
@@ -7,7 +7,7 @@ import numpy as np
from
tqdm
import
tqdm
from
tqdm
import
tqdm
import
re
import
re
import
pandas
as
pd
import
pandas
as
pd
from
TextAnalysisSpacy.TextComplexitySpacy
import
TextComplexitySpacy
from
texty.complexity
import
ComplexityAnalyzer
import
matplotlib.pyplot
as
plt
import
matplotlib.pyplot
as
plt
#%matplotlib inline ## when in Jupyter
#%matplotlib inline ## when in Jupyter
...
@@ -18,7 +18,7 @@ from nltk.text import Text
...
@@ -18,7 +18,7 @@ from nltk.text import Text
from
lexical_diversity
import
lex_div
as
ld
from
lexical_diversity
import
lex_div
as
ld
from
transformers
import
pipeline
from
transformers
import
pipeline
class
TextAnalysisSpacy
():
class
Analyzer
():
def
__init__
(
self
,
lang
=
'es'
):
def
__init__
(
self
,
lang
=
'es'
):
...
@@ -26,15 +26,14 @@ class TextAnalysisSpacy():
...
@@ -26,15 +26,14 @@ class TextAnalysisSpacy():
if
lang
==
'es'
:
if
lang
==
'es'
:
spacy
.
cli
.
download
(
"es_core_news_sm"
)
spacy
.
cli
.
download
(
"es_core_news_sm"
)
self
.
nlp
=
spacy
.
load
(
"es_core_news_sm"
)
self
.
nlp
=
spacy
.
load
(
"es_core_news_sm"
)
self
.
textComplexitySpacy
=
TextComplexitySpacy
()
elif
lang
==
'en'
:
elif
lang
==
'en'
:
spacy
.
cli
.
download
(
"en_core_web_sm"
)
self
.
nlp
=
spacy
.
load
(
"en_core_web_sm"
)
self
.
nlp
=
spacy
.
load
(
"en_core_web_sm"
)
self
.
textComplexitySpacy
=
TextComplexitySpacy
(
'en'
)
self
.
complexity_analyzer
=
ComplexityAnalyzer
(
self
.
nlp
)
self
.
Text
=
Text
self
.
Text
=
Text
self
.
FreqDist
=
FreqDist
self
.
FreqDist
=
FreqDist
self
.
POS_LIST
=
[
"ADJ"
,
"ADP"
,
"ADV"
,
"AUX"
,
"X"
,
"CCONJ"
,
"CONJ"
,
"DET"
,
"INTJ"
,
"NOUN"
,
"NUM"
,
"PART"
,
"PRON"
,
"PROPN"
,
"PUNCT"
,
"SCONJ"
,
"SYM"
,
"VERB"
,
"SPACE"
]
self
.
POS_LIST
=
[
"ADJ"
,
"ADP"
,
"ADV"
,
"AUX"
,
"X"
,
"CCONJ"
,
"CONJ"
,
"DET"
,
"INTJ"
,
"NOUN"
,
"NUM"
,
"PART"
,
"PRON"
,
"PROPN"
,
"PUNCT"
,
"SCONJ"
,
"SYM"
,
"VERB"
,
"SPACE"
]
pass
#
#
# X = samples input , y = tags
# X = samples input , y = tags
...
...
TextComplexitySpac
y.py
→
src/texty/complexit
y.py
View file @
e27fddf2
from
functools
import
reduce
from
functools
import
reduce
from
tkinter.font
import
_MetricsDict
import
spacy
import
spacy
import
math
import
math
import
syllables
import
syllables
import
os
import
re
crea_total_path
=
'./CREA_total.txt'
crea_total_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'CREA_total.txt'
)
class
ComplexityAnalyzer
():
class
TextComplexitySpacy
():
def
__init__
(
self
,
lang
=
'es'
):
def
__init__
(
self
,
lang
,
spacy_nlp
):
self
.
lang
=
lang
self
.
lang
=
lang
self
.
nlp
=
spacy_nlp
# create language analyzer
# create language analyzer
if
lang
==
'es'
:
if
lang
==
'es'
:
self
.
nlp
=
spacy
.
load
(
"es_core_news_sm"
)
# Load CREA
with
open
(
crea_total_path
)
as
f
:
lines
=
f
.
readlines
()
f
.
close
()
crea
=
{}
for
l
in
lines
[
1
:
1000
]:
# those words not in the 1000 most frequent words in CREA are low frequency words
data
=
l
.
strip
()
.
split
()
crea
[
data
[
1
]]
=
float
(
data
[
2
]
.
replace
(
','
,
''
))
self
.
lang_word_freqs
=
crea
if
lang
==
'en'
:
if
lang
==
'en'
:
self
.
nlp
=
spacy
.
load
(
"en_core_web_sm"
)
self
.
lang_word_freqs
=
{}
# Para leer el texto que introducimos
f
=
open
(
crea_total_path
)
lines
=
f
.
readlines
()
f
.
close
()
crea
=
{}
for
l
in
lines
[
1
:
1000
]:
# those words not in the 1000 most frequent words in CREA are low frequency words
data
=
l
.
strip
()
.
split
()
crea
[
data
[
1
]]
=
float
(
data
[
2
]
.
replace
(
','
,
''
))
self
.
crea
=
crea
pass
def
textProcessing
(
self
,
text
):
def
read
(
self
,
text
):
# Meter todas las funciones en una patron de los tokens válidos
# Meter todas las funciones en una patron de los tokens válidos
doc
=
self
.
nlp
(
text
)
doc
=
self
.
nlp
(
text
)
self
.
tokens
=
[
w
for
w
in
doc
]
self
.
tokens
=
[
w
for
w
in
doc
]
...
@@ -43,9 +43,45 @@ class TextComplexitySpacy():
...
@@ -43,9 +43,45 @@ class TextComplexitySpacy():
ws
=
self
.
nlp
(
sentence
.
text
)
ws
=
self
.
nlp
(
sentence
.
text
)
pos_content_sentences
.
append
([
w
for
w
in
ws
if
re
.
match
(
'NOUN.*|VERB.*|ADJ.*'
,
w
.
pos_
)])
pos_content_sentences
.
append
([
w
for
w
in
ws
if
re
.
match
(
'NOUN.*|VERB.*|ADJ.*'
,
w
.
pos_
)])
self
.
pos_content_sentences
=
pos_content_sentences
self
.
pos_content_sentences
=
pos_content_sentences
return
self
.
pos_content_sentences
def
get_all_metrics
(
self
):
self
.
punctuationMarks
()
self
.
lexicalComplexity
()
self
.
ssReadability
()
self
.
sentenceComplexity
()
self
.
autoReadability
()
self
.
embeddingDepth
()
self
.
readability
()
self
.
ageReadability
()
self
.
yearsCrawford
()
metrics
=
{
'npunct'
:
self
.
npunctuation
,
# number of punctuation marks
'nword'
:
self
.
N_words
,
# number of non punctiation tokens (words)
'ILFW'
:
self
.
ILFW
,
# index of low frequency words
'LDI'
:
self
.
LDI
,
# lexical diversity index
'LC'
:
self
.
LC
,
# lexical complexity index
'nrword'
:
self
.
N_rw
,
# number of rare words
'SSR'
:
self
.
SSR
,
# Spaulding's readability score
'avgsentl'
:
self
.
ASL
,
# average sentences length
'ncompsent'
:
self
.
N_cs
,
# number of complex sentences (those with composed verbs)
'nsent'
:
self
.
N_s
,
# number of sentences
'SCI'
:
self
.
SCI
,
# sentence complexity index
'nchar'
:
self
.
N_charac
,
# number of characters
'ARI'
:
self
.
ARI
,
# auto readability index
'min_depth'
:
self
.
min_max_list
,
# minimum of maximum tree depths
'max_depth'
:
self
.
max_max_list
,
# maximum of maximum tree depths
'mean_depth'
:
self
.
mean_max_list
,
# mean of maximum tree depths
'nsyllab'
:
self
.
n_syllables
,
# number of syllables
'huerta'
:
self
.
huertareadability
,
# Huerta's readability
'IFSZ'
:
self
.
ifszreadability
,
# Flesch-Szigrist legibility
'polini'
:
self
.
polinicompressibility
,
# Polini's compressibility
'mu'
:
self
.
mureadability
,
# Mu readability
'minage'
:
self
.
minimumage
,
# minimum age
'SOL'
:
self
.
solreadability
,
# SOL readability
'crawford'
:
self
.
years
# Crawford's years
}
return
metrics
def
punctuationMarks
(
self
):
def
punctuationMarks
(
self
):
# Solo nos interesa contar los tokens que sean signo de puntuación.
# Solo nos interesa contar los tokens que sean signo de puntuación.
...
@@ -108,6 +144,9 @@ class TextComplexitySpacy():
...
@@ -108,6 +144,9 @@ class TextComplexitySpacy():
return
self
.
N_lfw
,
self
.
N_cw
,
self
.
N_dcw
,
self
.
N_s
,
self
.
LDI
,
self
.
ILFW
,
self
.
LC
return
self
.
N_lfw
,
self
.
N_cw
,
self
.
N_dcw
,
self
.
N_s
,
self
.
LDI
,
self
.
ILFW
,
self
.
LC
def
ssReadability
(
self
):
def
ssReadability
(
self
):
'''
Spaulding Score of Readability
'''
#Number of rare words
#Number of rare words
byfreq
=
sorted
(
self
.
crea
,
key
=
self
.
crea
.
__getitem__
,
reverse
=
True
)
byfreq
=
sorted
(
self
.
crea
,
key
=
self
.
crea
.
__getitem__
,
reverse
=
True
)
byfreq
=
byfreq
[:
1500
]
byfreq
=
byfreq
[:
1500
]
...
@@ -202,7 +241,7 @@ class TextComplexitySpacy():
...
@@ -202,7 +241,7 @@ class TextComplexitySpacy():
return
self
.
max_max_list
,
self
.
min_max_list
,
self
.
mean_max_list
return
self
.
max_max_list
,
self
.
min_max_list
,
self
.
mean_max_list
def
syllable_counter_spanish
(
self
,
text
):
def
syllable_counter_spanish
(
self
,
text
):
if
self
.
lang
==
'es'
:
if
self
.
lang
==
'es'
:
t
=
re
.
sub
(
r'y([aáeéiíoóuú])'
,
'
\\
1'
,
text
.
lower
())
t
=
re
.
sub
(
r'y([aáeéiíoóuú])'
,
'
\\
1'
,
text
.
lower
())
t
=
re
.
sub
(
r'[aáeéioóu][iuy]'
,
'A'
,
t
.
lower
())
t
=
re
.
sub
(
r'[aáeéioóu][iuy]'
,
'A'
,
t
.
lower
())
...
@@ -265,7 +304,7 @@ class TextComplexitySpacy():
...
@@ -265,7 +304,7 @@ class TextComplexitySpacy():
self
.
mureadability
=
mureadability
self
.
mureadability
=
mureadability
return
self
.
n_syllables
,
self
.
n_syllables3
,
self
.
nletters
,
self
.
huertareadability
,
self
.
ifszreadability
,
self
.
polinicompressibility
,
self
.
mureadability
,
self
.
syll_words
,
self
.
words_sen
return
self
.
n_syllables
,
self
.
n_syllables3
,
self
.
nletters
,
self
.
huertareadability
,
self
.
ifszreadability
,
self
.
polinicompressibility
,
self
.
mureadability
,
self
.
syll_words
,
self
.
words_sen
def
ageReadability
(
self
):
def
ageReadability
(
self
):
minimumage
=
0.2495
*
(
self
.
N_words
/
self
.
nsentences
)
+
6.4763
*
(
self
.
n_syllables
/
self
.
N_words
)
-
7.1395
minimumage
=
0.2495
*
(
self
.
N_words
/
self
.
nsentences
)
+
6.4763
*
(
self
.
n_syllables
/
self
.
N_words
)
-
7.1395
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment