Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Arturo Montejo Ráez
/
text-complexity
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Settings
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
c3a3133a
authored
Jan 24, 2022
by
Alba Maria Mármol
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
Add TextComplexitySpacy
parent
1e209b7e
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
309 additions
and
0 deletions
TextComplexitySpacy.py
TextComplexitySpacy.py
0 → 100644
View file @
c3a3133a
import spacy as sp
import spacy.cli

import os
import re
import math
from functools import reduce

import numpy as np
import scipy.stats

# Make sure the Spanish spaCy model is available.  The original code
# downloaded it unconditionally on every import of this module; only
# fetch it when the model package is actually missing.
try:
    import es_core_news_sm
except ImportError:
    spacy.cli.download("es_core_news_sm")
    import es_core_news_sm
class TextComplexitySpacy():
    """Lexical, syntactic and readability complexity metrics for Spanish text.

    Usage: create an instance (this loads the spaCy Spanish pipeline and the
    CREA word-frequency list), call ``textProcessing(text)`` once, then call
    any of the metric methods; they consume the attributes stored on the
    instance by the earlier calls.
    """

    def __init__(self, lang='es'):
        """Load the spaCy Spanish pipeline and the CREA frequency table.

        Reads './CREA_total.txt' relative to the current working directory.
        Each data line is expected to hold: rank, word, frequency (with ','
        as the thousands separator).
        """
        self.lang = lang
        # Create language analyzer
        self.nlp = es_core_news_sm.load()
        # Words absent from the most frequent CREA entries are treated as
        # low-frequency words later on.  lines[1:1000] skips the header and
        # keeps file lines 2..1000 (999 entries, not 1000 as the original
        # comment suggested).
        # NOTE(review): no explicit encoding is passed to open(); confirm the
        # file matches the platform default encoding.
        with open('./CREA_total.txt') as f:
            lines = f.readlines()
        crea = {}
        for l in lines[1:1000]:
            data = l.strip().split()
            # data[1] is the word, data[2] its frequency, e.g. "9,999.99".
            crea[data[1]] = float(data[2].replace(',', ''))
        self.crea = crea

    def textProcessing(self, text):
        """Tokenise *text*, split it into sentences and keep, per sentence,
        only the content words (nouns, verbs, adjectives).

        Stores ``tokens``, ``sentences``, ``nsentences`` and
        ``pos_content_sentences``; returns ``pos_content_sentences``.
        """
        doc = self.nlp(text)
        self.tokens = [w for w in doc]
        self.sentences = [sent for sent in doc.sents]
        self.nsentences = len(self.sentences)
        print('Las oraciones: ', self.sentences)
        # Filter out tokens that are not adjectives, verbs or nouns.
        pos_content_sentences = []
        for sentence in self.sentences:
            ws = self.nlp(sentence.text)
            pos_content_sentences.append(
                [w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])
        self.pos_content_sentences = pos_content_sentences
        return self.pos_content_sentences

    def punctuationMarks(self):
        """Split the processed tokens into punctuation marks and words.

        Returns ``(npunctuation, punctuation, N_words)``.  ``N_words`` is
        clamped to a minimum of 1 so the readability formulas downstream
        never divide by zero.
        """
        punctuation = []
        N_words = []
        for w in self.tokens:
            if re.match('PUNCT.*', w.pos_):
                punctuation.append(w.text)
            else:
                N_words.append(w.text)
        print('Las palabras del texto son : ', N_words)
        aux = len(N_words)
        if aux == 0:
            aux = 1  # guard against division by zero downstream
        self.N_words = aux
        print('Number of words (N_words): ', self.N_words, '\n')
        self.npunctuation = len(punctuation)
        self.punctuation = punctuation
        print("PUNCTUATION MARKS = ", self.npunctuation, '\n')
        return self.npunctuation, self.punctuation, self.N_words

    def lexicalComplexity(self):
        """Compute the Lexical Complexity index LC = (LDI + ILFW) / 2.

        Returns ``(N_lfw, N_cw, N_dcw, N_s, LDI, ILFW, LC)``.  Requires
        ``textProcessing()`` to have run.
        """
        # Number of low frequency words: content words missing from the
        # CREA table loaded in __init__ (case-sensitive lookup).
        count = 0
        for sentence in self.pos_content_sentences:
            for w in sentence:
                if w.text not in self.crea:
                    count += 1
        N_lfw = count
        self.N_lfw = N_lfw
        print("Number of low frequency words (N_lfw): ", self.N_lfw, "\n")
        # Number of distinct content words (case-insensitive).
        N_dcw = len({w.text.lower()
                     for s in self.pos_content_sentences for w in s})
        self.N_dcw = N_dcw
        print('Number of distinct content words (N_dcw) = ', self.N_dcw, '\n')
        # Number of sentences
        self.N_s = len(self.pos_content_sentences)
        print("Number of sentences (N_s): ", self.N_s, "\n")
        # Number of total content words.  sum() also handles the empty case,
        # where the original reduce() over an empty list raised a TypeError.
        N_cw = sum(len(s) for s in self.pos_content_sentences)
        self.N_cw = N_cw
        print("Number of total content words (N_cw): ", self.N_cw, "\n")
        # Lexical Distribution Index
        if self.N_s == 0:
            self.N_s = 1
        LDI = N_dcw / float(self.N_s)
        self.LDI = LDI
        print("Lexical Distribution Index (LDI) = ", self.LDI, '\n')
        # Index of Low Frequency Words
        if N_cw == 0:
            N_cw = 1
        ILFW = N_lfw / float(N_cw)
        self.ILFW = ILFW
        print("Index Low Frequency Words (ILFW) = ", self.ILFW, '\n')
        # Lexical Complexity
        LC = (LDI + ILFW) / 2
        self.LC = LC
        print("LEXICAL COMPLEXITY INDEX (LC) =", LC, "\n")
        return (self.N_lfw, self.N_cw, self.N_dcw, self.N_s,
                self.LDI, self.ILFW, self.LC)

    def ssReadability(self):
        """Spaulding Spanish Readability score.

        SSR = 1.609*(words/sentence) + 331.8*(rare words/word) + 22.0.
        Requires ``punctuationMarks()`` (N_words) and ``lexicalComplexity()``
        (N_s) to have run.  Returns ``(N_rw, SSR)``.
        """
        # Rare words: content words outside the 1500 most frequent CREA
        # entries.  A frozenset makes each membership test O(1) where the
        # original scanned a 1500-element list for every word.
        byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True)
        top_words = frozenset(byfreq[:1500])
        count = 0
        for sentence in self.pos_content_sentences:
            for w in sentence:
                if w.text.lower() not in top_words:
                    count += 1
        N_rw = count
        self.N_rw = N_rw
        print("Number of rare words (N_rw): ", self.N_rw, "\n")
        SSR = (1.609 * (self.N_words / self.N_s)
               + 331.8 * (self.N_rw / self.N_words) + 22.0)
        self.SSR = SSR
        print("SPAULDING SPANISH READABILITY (SSR) ", self.SSR, "\n")
        return self.N_rw, self.SSR

    def sentenceComplexity(self):
        """Sentence Complexity Index SCI = (ASL + CS) / 2.

        A sentence counts as complex when it contains at least one pair of
        consecutive verb tokens.  Returns ``(N_cs, ASL, CS, SCI)``.
        """
        # Number of complex sentences
        N_cs = 0
        for sentence in self.sentences:
            previous_is_verb = False
            count = 0
            for w in sentence:
                if re.match('VERB.*', w.pos_):
                    if previous_is_verb:
                        # Second verb of a consecutive pair.
                        count += 1
                        previous_is_verb = False
                    else:
                        previous_is_verb = True
                else:
                    previous_is_verb = False
            if count > 0:
                N_cs += 1
        self.N_cs = N_cs
        print("Number of complex sentences: ", self.N_cs, "\n")
        ASL = self.N_words / self.N_s   # average sentence length
        self.ASL = ASL
        print("Average Sentence Length (ASL) = ", self.ASL, '\n')
        CS = self.N_cs / self.N_s       # share of complex sentences
        self.CS = CS
        print("Complex Sentences (CS) = ", self.CS, '\n')
        SCI = (ASL + CS) / 2
        self.SCI = SCI
        print("SENTENCE COMPLEXITY INDEX:(SCI) = ", self.SCI, "\n")
        return self.N_cs, self.ASL, self.CS, self.SCI

    def autoReadability(self):
        """Automated Readability Index (ARI).

        Also builds ``self.listwords`` (token texts that are not line-break
        tokens), which ``readability()`` consumes.  Returns
        ``(N_charac, ARI, listwords)``.
        """
        count = 0
        listwords = []
        for words in self.sentences:
            for w in words:
                if re.match('\r\n.*', w.text):
                    count += 1  # skip line-break tokens
                else:
                    listwords.append(w.text)
        self.listwords = listwords
        # Number of characters over the kept words.
        N_charac = 0
        for characters in self.listwords:
            N_charac += len(characters)
        self.N_charac = N_charac
        print("Number of characters: ", self.N_charac, "\n")
        ARI = (4.71 * self.N_charac / self.N_words
               + 0.5 * self.N_words / self.N_s - 21.43)
        self.ARI = ARI
        print("AUTOMATED READABILITY INDEX (ARI) = ", self.ARI, '\n')
        return self.N_charac, self.ARI, self.listwords

    def tree_height(self, root):
        """Return the height (in nodes) of the dependency subtree rooted at
        *root*; a leaf has height 1."""
        if not list(root.children):
            return 1
        return 1 + max(self.tree_height(x) for x in root.children)

    def embeddingDepth(self):
        """Max / min / mean dependency-tree depth over the sentences.

        Returns ``(max_max_list, min_max_list, mean_max_list)``.  Assumes
        ``textProcessing()`` found at least one sentence.
        """
        roots = [sent.root for sent in self.sentences]
        max_list = [self.tree_height(root) for root in roots]
        mean_max_list = sum(max_list) / (len(max_list))
        max_max_list = max(max_list)
        min_max_list = min(max_list)
        print('MAXIMUM EMBEDDING DEPTH OF SENTENCE (MaxDEPTH): ',
              max_max_list, '\n')
        print('MINIMUM EMBEDDING DEPTH OF SENTENCE (MinDEPTH): ',
              min_max_list, '\n')
        print('AVERAGE EMBEDDING DEPTH OF SENTENCE (MeanDEPTH): ',
              mean_max_list, '\n')
        self.max_max_list = max_max_list
        self.min_max_list = min_max_list
        self.mean_max_list = mean_max_list
        return self.max_max_list, self.min_max_list, self.mean_max_list

    def syllable_counter_spanish(self, text):
        """Approximate the number of syllables in the Spanish word *text*.

        Consonantal 'y' before a vowel is dropped, diphthongs are collapsed
        to a single nucleus, then every vowel nucleus is marked 'A' and the
        marks are counted.
        """
        t = re.sub(r'y([aáeéiíoóuú])', '\\1', text.lower())
        t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())
        # NOTE: the trailing .lower() also folds previously inserted 'A'
        # marks to 'a'; the final substitution re-marks them, so the count
        # is unaffected.  Kept as in the original.
        t = re.sub(r'[iu][aáeyéioóu]', 'A', t).lower()
        t = re.sub(r'[aáeéiíoóuúy]', 'A', t)
        return (len(t.split('A')) - 1)

    def readability(self):
        """Classic Spanish readability formulas (Fernández Huerta, IFSZ,
        Gutiérrez de Polini, readability µ).

        Requires ``autoReadability()`` (listwords), ``punctuationMarks()``
        (N_words) and ``textProcessing()`` (nsentences) to have run.
        Returns ``(n_syllables, n_syllables3, nletters, huertareadability,
        ifszreadability, polinicompressibility, mureadability, syll_words,
        words_sen)``.
        """
        # Number of syllables, and number of words with 3 or more syllables.
        n_syllables = 0
        n_syllables3 = 0
        for words in self.listwords:
            syllables = self.syllable_counter_spanish(words)
            n_syllables += syllables
            if syllables >= 3:
                n_syllables3 += 1
        self.n_syllables = n_syllables
        self.n_syllables3 = n_syllables3
        # Number of letters, counted over alphabetic words only.
        nletters = 0
        letters = []
        vecletters = []
        for word in self.listwords:
            if re.match('[a-zA-Z]|á|ó|í|ú|é', word):
                letters.append(word)
                nletters += len(word)
                vecletters.append(len(word))
        self.letters = letters
        self.nletters = nletters
        self.vecletters = vecletters
        # Fernández Huerta readability.
        huertareadability = (206.835
                             - 60 * (self.n_syllables / self.N_words)
                             - 102 * (self.nsentences / self.N_words))
        print("THE READABILITY OF HUERTA: ", huertareadability, "\n")
        self.huertareadability = huertareadability
        # Flesch-Szigriszt (IFSZ) index.
        ifszreadability = (206.835
                           - 62.3 * (self.n_syllables / self.N_words)
                           - (self.N_words / self.nsentences))
        print("THE READABILITY IFSZ: ", ifszreadability, "\n")
        self.ifszreadability = ifszreadability
        self.syll_words = self.n_syllables / self.N_words
        # Gutiérrez de Polini comprehensibility.
        polinicompressibility = (95.2
                                 - 9.7 * (self.nletters / self.N_words)
                                 - 0.35 * (self.N_words / self.nsentences))
        print("THE COMPRESSIBILITY OF GUTIÉRREZ POLINI: ",
              polinicompressibility, "\n")
        self.polinicompressibility = polinicompressibility
        self.words_sen = self.N_words / self.nsentences
        # Readability µ: mean word length over variance of word lengths;
        # both denominators are clamped to avoid division by zero.
        x = self.nletters / self.N_words
        varianza = np.var(self.vecletters)
        if varianza == 0:
            varianza = 1
        aux = self.N_words - 1
        if aux == 0:
            aux = 1
        mureadability = (self.N_words / aux) * (x / varianza) * 100
        print("READABILITY MU: ", mureadability, "\n")
        self.mureadability = mureadability
        return (self.n_syllables, self.n_syllables3, self.nletters,
                self.huertareadability, self.ifszreadability,
                self.polinicompressibility, self.mureadability,
                self.syll_words, self.words_sen)

    def ageReadability(self):
        """Minimum comprehension age and SOL readability.

        Requires ``readability()`` (n_syllables, n_syllables3),
        ``punctuationMarks()`` (N_words) and ``textProcessing()``
        (nsentences) to have run.  Returns ``(minimumage, solreadability)``.
        """
        minimumage = (0.2495 * (self.N_words / self.nsentences)
                      + 6.4763 * (self.n_syllables / self.N_words)
                      - 7.1395)
        print("MINIMUM AGE TO UNDERSTAND A TEXT: ", minimumage, "\n")
        self.minimumage = minimumage
        solreadability = -2.51 + 0.74 * (3.1291 + 1.0430 * math.sqrt(
            self.n_syllables3 * (30 / self.nsentences)))
        print("THE READABILITY SOL: ", solreadability, "\n")
        self.solreadability = solreadability
        return self.minimumage, self.solreadability

    def yearsCrawford(self):
        """Crawford formula: school years needed to understand the text.

        Requires ``readability()``, ``punctuationMarks()`` and
        ``textProcessing()`` to have run.  Returns the estimated years.
        """
        years = (-20.5 * (self.nsentences / self.N_words)
                 + 4.9 * (self.n_syllables / self.N_words)
                 - 3.407)
        print("YEARS NEEDED: ", years, "\n")
        self.years = years
        return self.years
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment