Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
SINAI
/
texty
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
e27fddf2
authored
Mar 05, 2022
by
Arturo Montejo Ráez
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
complexity analyzer refactored
parent
6330f677
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
66 additions
and
28 deletions
example.ipynb → examples/example.ipynb
texts.csv → examples/texts.csv
CREA_total.txt → src/texty/CREA_total.txt
__init__.py → src/texty/__init__.py
TextAnalysisSpacy.py → src/texty/analyzer.py
TextComplexitySpacy.py → src/texty/complexity.py
example.ipynb
→
example
s/example
.ipynb
View file @
e27fddf2
File moved
texts.csv
→
examples/
texts.csv
View file @
e27fddf2
File moved
CREA_total.txt
→
src/texty/
CREA_total.txt
View file @
e27fddf2
The file could not be displayed because it is too large.
__init__.py
→
src/texty/
__init__.py
View file @
e27fddf2
File moved
TextAnalysisSpacy
.py
→
src/texty/analyzer
.py
View file @
e27fddf2
...
...
@@ -7,7 +7,7 @@ import numpy as np
from
tqdm
import
tqdm
import
re
import
pandas
as
pd
from
TextAnalysisSpacy.TextComplexitySpacy
import
TextComplexitySpacy
from
texty.complexity
import
ComplexityAnalyzer
import
matplotlib.pyplot
as
plt
#%matplotlib inline ## when in Jupyter
...
...
@@ -18,7 +18,7 @@ from nltk.text import Text
from
lexical_diversity
import
lex_div
as
ld
from
transformers
import
pipeline
class
TextAnalysisSpacy
():
class
Analyzer
():
def
__init__
(
self
,
lang
=
'es'
):
...
...
@@ -26,15 +26,14 @@ class TextAnalysisSpacy():
if
lang
==
'es'
:
spacy
.
cli
.
download
(
"es_core_news_sm"
)
self
.
nlp
=
spacy
.
load
(
"es_core_news_sm"
)
self
.
textComplexitySpacy
=
TextComplexitySpacy
()
elif
lang
==
'en'
:
spacy
.
cli
.
download
(
"en_core_web_sm"
)
self
.
nlp
=
spacy
.
load
(
"en_core_web_sm"
)
self
.
textComplexitySpacy
=
TextComplexitySpacy
(
'en'
)
self
.
complexity_analyzer
=
ComplexityAnalyzer
(
self
.
nlp
)
self
.
Text
=
Text
self
.
FreqDist
=
FreqDist
self
.
POS_LIST
=
[
"ADJ"
,
"ADP"
,
"ADV"
,
"AUX"
,
"X"
,
"CCONJ"
,
"CONJ"
,
"DET"
,
"INTJ"
,
"NOUN"
,
"NUM"
,
"PART"
,
"PRON"
,
"PROPN"
,
"PUNCT"
,
"SCONJ"
,
"SYM"
,
"VERB"
,
"SPACE"
]
pass
#
# X = samples input , y = tags
...
...
TextComplexitySpac
y.py
→
src/texty/complexit
y.py
View file @
e27fddf2
from
functools
import
reduce
from
tkinter.font
import
_MetricsDict
import
spacy
import
math
import
syllables
import
os
import
re
crea_total_path
=
'./CREA_total.txt'
crea_total_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'CREA_total.txt'
)
class
TextComplexitySpacy
():
class
ComplexityAnalyzer
():
def
__init__
(
self
,
lang
=
'es'
):
def
__init__
(
self
,
lang
,
spacy_nlp
):
self
.
lang
=
lang
self
.
nlp
=
spacy_nlp
# create language analyzer
if
lang
==
'es'
:
self
.
nlp
=
spacy
.
load
(
"es_core_news_sm"
)
# Load CREA
with
open
(
crea_total_path
)
as
f
:
lines
=
f
.
readlines
()
f
.
close
()
crea
=
{}
for
l
in
lines
[
1
:
1000
]:
# those words not in the 1000 most frequent words in CREA are low frequency words
data
=
l
.
strip
()
.
split
()
crea
[
data
[
1
]]
=
float
(
data
[
2
]
.
replace
(
','
,
''
))
self
.
lang_word_freqs
=
crea
if
lang
==
'en'
:
self
.
nlp
=
spacy
.
load
(
"en_core_web_sm"
)
# Para leer el texto que introducimos
f
=
open
(
crea_total_path
)
lines
=
f
.
readlines
()
f
.
close
()
crea
=
{}
for
l
in
lines
[
1
:
1000
]:
# those words not in the 1000 most frequent words in CREA are low frequency words
data
=
l
.
strip
()
.
split
()
crea
[
data
[
1
]]
=
float
(
data
[
2
]
.
replace
(
','
,
''
))
self
.
crea
=
crea
pass
self
.
lang_word_freqs
=
{}
def
textProcessing
(
self
,
text
):
def
read
(
self
,
text
):
# Meter todas las funciones en una patron de los tokens válidos
doc
=
self
.
nlp
(
text
)
self
.
tokens
=
[
w
for
w
in
doc
]
...
...
@@ -43,9 +43,45 @@ class TextComplexitySpacy():
ws
=
self
.
nlp
(
sentence
.
text
)
pos_content_sentences
.
append
([
w
for
w
in
ws
if
re
.
match
(
'NOUN.*|VERB.*|ADJ.*'
,
w
.
pos_
)])
self
.
pos_content_sentences
=
pos_content_sentences
return
self
.
pos_content_sentences
def get_all_metrics(self):
    """Run every metric routine and gather the results into one dictionary.

    The nine computation methods are invoked one after another in a fixed
    order; afterwards the attributes listed below are read from ``self``
    (presumably populated by those calls -- confirm against each method)
    and returned under short metric names.

    Returns:
        dict: short metric name -> value of the corresponding attribute,
        in the order listed in ``sources``.
    """
    # Executed strictly in this sequence; each method is looked up only when
    # its turn comes, exactly like a plain series of calls.
    for step in (
        'punctuationMarks',
        'lexicalComplexity',
        'ssReadability',
        'sentenceComplexity',
        'autoReadability',
        'embeddingDepth',
        'readability',
        'ageReadability',
        'yearsCrawford',
    ):
        getattr(self, step)()

    # (result key, attribute that holds the value), in output order.
    sources = (
        ('npunct', 'npunctuation'),            # number of punctuation marks
        ('nword', 'N_words'),                  # number of non-punctuation tokens (words)
        ('ILFW', 'ILFW'),                      # index of low frequency words
        ('LDI', 'LDI'),                        # lexical diversity index
        ('LC', 'LC'),                          # lexical complexity index
        ('nrword', 'N_rw'),                    # number of rare words
        ('SSR', 'SSR'),                        # Spaulding's readability score
        ('avgsentl', 'ASL'),                   # average sentence length
        ('ncompsent', 'N_cs'),                 # number of complex sentences (those with composed verbs)
        ('nsent', 'N_s'),                      # number of sentences
        ('SCI', 'SCI'),                        # sentence complexity index
        ('nchar', 'N_charac'),                 # number of characters
        ('ARI', 'ARI'),                        # auto readability index
        ('min_depth', 'min_max_list'),         # minimum of maximum tree depths
        ('max_depth', 'max_max_list'),         # maximum of maximum tree depths
        ('mean_depth', 'mean_max_list'),       # mean of maximum tree depths
        ('nsyllab', 'n_syllables'),            # number of syllables
        ('huerta', 'huertareadability'),       # Huerta's readability
        ('IFSZ', 'ifszreadability'),           # Flesch-Szigrist legibility
        ('polini', 'polinicompressibility'),   # Polini's compressibility
        ('mu', 'mureadability'),               # Mu readability
        ('minage', 'minimumage'),              # minimum age
        ('SOL', 'solreadability'),             # SOL readability
        ('crawford', 'years'),                 # Crawford's years
    )
    return {label: getattr(self, field) for label, field in sources}
def
punctuationMarks
(
self
):
# Solo nos interesa contar los tokens que sean signo de puntuación.
...
...
@@ -108,6 +144,9 @@ class TextComplexitySpacy():
return
self
.
N_lfw
,
self
.
N_cw
,
self
.
N_dcw
,
self
.
N_s
,
self
.
LDI
,
self
.
ILFW
,
self
.
LC
def
ssReadability
(
self
):
'''
Spaulding Score of Readability
'''
#Number of rare words
byfreq
=
sorted
(
self
.
crea
,
key
=
self
.
crea
.
__getitem__
,
reverse
=
True
)
byfreq
=
byfreq
[:
1500
]
...
...
@@ -202,7 +241,7 @@ class TextComplexitySpacy():
return
self
.
max_max_list
,
self
.
min_max_list
,
self
.
mean_max_list
def
syllable_counter_spanish
(
self
,
text
):
def
syllable_counter_spanish
(
self
,
text
):
if
self
.
lang
==
'es'
:
t
=
re
.
sub
(
r'y([aáeéiíoóuú])'
,
'
\\
1'
,
text
.
lower
())
t
=
re
.
sub
(
r'[aáeéioóu][iuy]'
,
'A'
,
t
.
lower
())
...
...
@@ -265,7 +304,7 @@ class TextComplexitySpacy():
self
.
mureadability
=
mureadability
return
self
.
n_syllables
,
self
.
n_syllables3
,
self
.
nletters
,
self
.
huertareadability
,
self
.
ifszreadability
,
self
.
polinicompressibility
,
self
.
mureadability
,
self
.
syll_words
,
self
.
words_sen
def
ageReadability
(
self
):
minimumage
=
0.2495
*
(
self
.
N_words
/
self
.
nsentences
)
+
6.4763
*
(
self
.
n_syllables
/
self
.
N_words
)
-
7.1395
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment