Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
SINAI
/
texty
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
43067364
authored
Feb 09, 2022
by
Alba Maria Mármol
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
Add new file
parent
8f206f7b
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
280 additions
and
0 deletions
TextComplexitySpacy.py
TextComplexitySpacy.py
0 → 100644
View file @
43067364
import math
import re
import statistics
from functools import reduce

import syllables
class TextComplexitySpacy():
    """Compute a battery of text-complexity metrics over a spaCy parse.

    Usage: call textProcessing() once per text, then the individual metric
    methods (punctuationMarks, lexicalComplexity, ssReadability, ...), which
    read the state cached on self by the earlier calls.
    """

    def __init__(self, lang='es', crea_total_path=None):
        """Load the spaCy model for *lang* and the CREA frequency table.

        Parameters
        ----------
        lang : str
            'es' or 'en'; selects which spaCy model is loaded. Any other
            value loads no model (self.nlp stays unset).
        crea_total_path : str or None
            Path to the CREA frequency file. When None, falls back to a
            module-level ``crea_total_path`` global, which is what the
            original code read implicitly.
        """
        self.lang = lang
        # Create the language analyzer. Imports are local so that merely
        # importing this module does not require the models to be installed.
        if lang == 'es':
            import es_core_news_sm
            self.nlp = es_core_news_sm.load()
        if lang == 'en':
            import spacy
            self.nlp = spacy.load("en_core_web_sm")
        if crea_total_path is None:
            # Preserve the original behaviour of reading a module-level
            # global; raises KeyError here if the global was never defined.
            crea_total_path = globals()["crea_total_path"]
        # Read the CREA frequency list; the with-block guarantees the file
        # is closed even if parsing fails (the original leaked the handle).
        with open(crea_total_path) as f:
            lines = f.readlines()
        crea = {}
        # Words not among the ~1000 most frequent in CREA are treated as
        # low-frequency words. NOTE(review): lines[1:1000] skips the header
        # line and keeps 999 entries — confirm the intended cut-off.
        for line in lines[1:1000]:
            data = line.strip().split()
            # Columns: rank, word, absolute frequency (thousands commas).
            crea[data[1]] = float(data[2].replace(',', ''))
        self.crea = crea
def
textProcessing
(
self
,
text
):
# Meter todas las funciones en una patron de los tokens válidos
doc
=
self
.
nlp
(
text
)
self
.
tokens
=
[
w
for
w
in
doc
]
self
.
sentences
=
[
sent
for
sent
in
doc
.
sents
]
self
.
nsentences
=
len
(
self
.
sentences
)
'''
Filtra aquellos tokens que no sean adjetivos, verbos o sustantivos
'''
pos_content_sentences
=
[]
for
sentence
in
self
.
sentences
:
ws
=
self
.
nlp
(
sentence
.
text
)
pos_content_sentences
.
append
([
w
for
w
in
ws
if
re
.
match
(
'NOUN.*|VERB.*|ADJ.*'
,
w
.
pos_
)])
self
.
pos_content_sentences
=
pos_content_sentences
return
self
.
pos_content_sentences
def
punctuationMarks
(
self
):
# Solo nos interesa contar los tokens que sean signo de puntuación.
# Number of words.
punctuation
=
[]
N_words
=
[]
for
w
in
self
.
tokens
:
if
re
.
match
(
'PUNCT.*'
,
w
.
pos_
):
punctuation
.
append
(
w
.
text
)
else
:
N_words
.
append
(
w
.
text
)
aux
=
len
(
N_words
)
if
aux
==
0
:
aux
=
1
self
.
N_words
=
aux
self
.
npunctuation
=
len
(
punctuation
)
self
.
punctuation
=
punctuation
return
self
.
npunctuation
,
self
.
punctuation
,
self
.
N_words
def
lexicalComplexity
(
self
):
# Number of low frequency words
count
=
0
for
sentence
in
self
.
pos_content_sentences
:
for
w
in
sentence
:
if
w
.
text
not
in
self
.
crea
:
count
+=
1
N_lfw
=
count
self
.
N_lfw
=
N_lfw
# Number of distinct content words
N_dcw
=
len
(
set
([
w
.
text
.
lower
()
for
s
in
self
.
pos_content_sentences
for
w
in
s
]))
self
.
N_dcw
=
N_dcw
# Number of sentences
self
.
N_s
=
len
(
self
.
pos_content_sentences
)
# Number of total content words
N_cw
=
reduce
((
lambda
x
,
y
:
x
+
y
),
[
len
(
s
)
for
s
in
self
.
pos_content_sentences
])
self
.
N_cw
=
N_cw
# Lexical Distribution Index
if
self
.
N_s
==
0
:
self
.
N_s
=
1
LDI
=
N_dcw
/
float
(
self
.
N_s
)
self
.
LDI
=
LDI
# Index of Low Frequency Words
if
N_cw
==
0
:
N_cw
=
1
ILFW
=
N_lfw
/
float
(
N_cw
)
self
.
ILFW
=
ILFW
# Lexical Complexity
LC
=
(
LDI
+
ILFW
)
/
2
self
.
LC
=
LC
return
self
.
N_lfw
,
self
.
N_cw
,
self
.
N_dcw
,
self
.
N_s
,
self
.
LDI
,
self
.
ILFW
,
self
.
LC
def
ssReadability
(
self
):
#Number of rare words
byfreq
=
sorted
(
self
.
crea
,
key
=
self
.
crea
.
__getitem__
,
reverse
=
True
)
byfreq
=
byfreq
[:
1500
]
count
=
0
for
sentence
in
self
.
pos_content_sentences
:
for
w
in
sentence
:
if
w
.
text
.
lower
()
not
in
byfreq
:
count
+=
1
N_rw
=
count
self
.
N_rw
=
N_rw
SSR
=
1.609
*
(
self
.
N_words
/
self
.
N_s
)
+
331.8
*
(
self
.
N_rw
/
self
.
N_words
)
+
22.0
self
.
SSR
=
SSR
return
self
.
N_rw
,
self
.
SSR
def
sentenceComplexity
(
self
):
#Number of complex sentences
N_cs
=
0
for
sentence
in
self
.
sentences
:
previous_is_verb
=
False
count
=
0
for
w
in
sentence
:
if
re
.
match
(
'VERB.*'
,
w
.
pos_
):
if
(
previous_is_verb
):
count
+=
1
previous_is_verb
=
False
else
:
previous_is_verb
=
True
else
:
previous_is_verb
=
False
if
count
>
0
:
N_cs
+=
1
self
.
N_cs
=
N_cs
ASL
=
self
.
N_words
/
self
.
N_s
self
.
ASL
=
ASL
CS
=
self
.
N_cs
/
self
.
N_s
self
.
CS
=
CS
SCI
=
(
ASL
+
CS
)
/
2
self
.
SCI
=
SCI
return
self
.
N_cs
,
self
.
ASL
,
self
.
CS
,
self
.
SCI
def
autoReadability
(
self
):
# Number of characters
count
=
0
listwords
=
[]
for
words
in
self
.
sentences
:
for
w
in
words
:
if
re
.
match
(
'
\r\n
.*'
,
w
.
text
):
count
+=
1
else
:
listwords
.
append
(
w
.
text
)
self
.
listwords
=
listwords
N_charac
=
0
for
characters
in
self
.
listwords
:
N_charac
+=
len
(
characters
)
self
.
N_charac
=
N_charac
ARI
=
4.71
*
self
.
N_charac
/
self
.
N_words
+
0.5
*
self
.
N_words
/
self
.
N_s
-
21.43
self
.
ARI
=
ARI
return
self
.
N_charac
,
self
.
ARI
,
self
.
listwords
def
tree_height
(
self
,
root
,
cont
):
if
not
list
(
root
.
children
):
return
1
else
:
cont
+=
1
if
cont
==
320
:
return
320
return
1
+
max
(
self
.
tree_height
(
x
,
cont
)
for
x
in
root
.
children
)
def
embeddingDepth
(
self
):
## Output results
roots
=
[
sent
.
root
for
sent
in
self
.
sentences
]
max_list
=
[]
max_list
=
[
self
.
tree_height
(
root
,
0
)
for
root
in
roots
]
mean_max_list
=
sum
(
max_list
)
/
(
len
(
max_list
))
max_max_list
=
max
(
max_list
)
min_max_list
=
min
(
max_list
)
self
.
max_max_list
=
max_max_list
self
.
min_max_list
=
min_max_list
self
.
mean_max_list
=
mean_max_list
return
self
.
max_max_list
,
self
.
min_max_list
,
self
.
mean_max_list
def
syllable_counter_spanish
(
self
,
text
):
if
self
.
lang
==
'es'
:
t
=
re
.
sub
(
r'y([aáeéiíoóuú])'
,
'
\\
1'
,
text
.
lower
())
t
=
re
.
sub
(
r'[aáeéioóu][iuy]'
,
'A'
,
t
.
lower
())
t
=
re
.
sub
(
r'[iu][aáeyéioóu]'
,
'A'
,
t
)
.
lower
()
t
=
re
.
sub
(
r'[aáeéiíoóuúy]'
,
'A'
,
t
)
return
(
len
(
t
.
split
(
'A'
))
-
1
)
elif
self
.
lang
==
'en'
:
return
syllables
.
estimate
(
text
)
def
readability
(
self
):
# Number of syllables and Number of words with 3 or more syllables:tagger
n_syllables
=
0
n_syllables3
=
0
for
words
in
self
.
listwords
:
syllables
=
self
.
syllable_counter_spanish
(
words
)
n_syllables
+=
syllables
if
syllables
>=
3
:
n_syllables3
+=
1
self
.
n_syllables
=
n_syllables
self
.
n_syllables3
=
n_syllables3
# Number of letters
nletters
=
0
letters
=
[]
vecletters
=
[]
for
word
in
self
.
listwords
:
if
re
.
match
(
'[a-zA-Z]|á|ó|í|ú|é'
,
word
):
letters
.
append
(
word
)
nletters
+=
len
(
word
)
vecletters
.
append
(
len
(
word
))
self
.
letters
=
letters
self
.
nletters
=
nletters
self
.
vecletters
=
vecletters
huertareadability
=
206.835
-
60
*
(
self
.
n_syllables
/
self
.
N_words
)
-
102
*
(
self
.
nsentences
/
self
.
N_words
)
self
.
huertareadability
=
huertareadability
ifszreadability
=
206.835
-
62.3
*
(
self
.
n_syllables
/
self
.
N_words
)
-
(
self
.
N_words
/
self
.
nsentences
)
self
.
ifszreadability
=
ifszreadability
self
.
syll_words
=
self
.
n_syllables
/
self
.
N_words
polinicompressibility
=
95.2
-
9.7
*
(
self
.
nletters
/
self
.
N_words
)
-
0.35
*
(
self
.
N_words
/
self
.
nsentences
)
self
.
polinicompressibility
=
polinicompressibility
self
.
words_sen
=
self
.
N_words
/
self
.
nsentences
x
=
self
.
nletters
/
self
.
N_words
varianza
=
np
.
var
(
self
.
vecletters
)
if
varianza
==
0
:
varianza
=
1
aux
=
self
.
N_words
-
1
if
aux
==
0
:
aux
=
1
mureadability
=
(
self
.
N_words
/
aux
)
*
(
x
/
varianza
)
*
100
self
.
mureadability
=
mureadability
return
self
.
n_syllables
,
self
.
n_syllables3
,
self
.
nletters
,
self
.
huertareadability
,
self
.
ifszreadability
,
self
.
polinicompressibility
,
self
.
mureadability
,
self
.
syll_words
,
self
.
words_sen
def
ageReadability
(
self
):
minimumage
=
0.2495
*
(
self
.
N_words
/
self
.
nsentences
)
+
6.4763
*
(
self
.
n_syllables
/
self
.
N_words
)
-
7.1395
self
.
minimumage
=
minimumage
solreadability
=
-
2.51
+
0.74
*
(
3.1291
+
1.0430
*
math
.
sqrt
(
self
.
n_syllables3
*
(
30
/
self
.
nsentences
)))
self
.
solreadability
=
solreadability
return
self
.
minimumage
,
self
.
solreadability
def
yearsCrawford
(
self
):
years
=
-
20.5
*
(
self
.
nsentences
/
self
.
N_words
)
+
4.9
*
(
self
.
n_syllables
/
self
.
N_words
)
-
3.407
self
.
years
=
years
return
self
.
years
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment