Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
SINAI
/
clef-pan2018
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Settings
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
8f96e893
authored
May 12, 2018
by
Arturo Montejo Ráez
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
classify_comp working on tira.io
parent
1df9d9e2
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
295 additions
and
136 deletions
.gitignore
ComplexityEnglish.py
ComplexityFrench.py
ComplexityItalian.py
ComplexityLanguage.py
ComplexitySpanish.py
classify_comp
classify_comp.py
.gitignore
0 → 100644
View file @
8f96e893
__pycache__
ComplexityEnglish.py
View file @
8f96e893
import
sys
sys
.
path
.
append
(
'/home/garciacumbreras18/dist/freeling/APIs/python'
)
from
ComplexityLanguage
import
ComplexityLanguage
from
ComplexityLanguage
import
ComplexityLanguage
import
re
import
re
import
math
import
math
...
@@ -179,4 +181,4 @@ class ComplexityEnglish(ComplexityLanguage):
...
@@ -179,4 +181,4 @@ class ComplexityEnglish(ComplexityLanguage):
return
metrics
return
metrics
\ No newline at end of file
ComplexityFrench.py
View file @
8f96e893
import
sys
sys
.
path
.
append
(
'/home/garciacumbreras18/dist/freeling/APIs/python'
)
from
ComplexityLanguage
import
ComplexityLanguage
from
ComplexityLanguage
import
ComplexityLanguage
import
freeling
import
freeling
import
os
import
os
...
@@ -15,9 +17,9 @@ class ComplexityFrench(ComplexityLanguage):
...
@@ -15,9 +17,9 @@ class ComplexityFrench(ComplexityLanguage):
ComplexityLanguage
.
__init__
(
self
,
lang
)
ComplexityLanguage
.
__init__
(
self
,
lang
)
## Modify this line to be your FreeLing installation directory
## Modify this line to be your FreeLing installation directory
FREELINGDIR
=
"/
usr/local
"
FREELINGDIR
=
"/
home/garciacumbreras18/dist/freeling
"
DATA
=
FREELINGDIR
+
"/
share/freeling
/"
DATA
=
FREELINGDIR
+
"/
data
/"
CLASSDIR
=
"
/home/sinai/Experiments/CLEF-PAN/
"
CLASSDIR
=
""
self
.
lang
=
lang
self
.
lang
=
lang
freeling
.
util_init_locale
(
"default"
)
freeling
.
util_init_locale
(
"default"
)
...
@@ -50,12 +52,8 @@ class ComplexityFrench(ComplexityLanguage):
...
@@ -50,12 +52,8 @@ class ComplexityFrench(ComplexityLanguage):
self
.
tg
=
freeling
.
hmm_tagger
(
DATA
+
lang
+
"/tagger.dat"
,
True
,
2
)
self
.
tg
=
freeling
.
hmm_tagger
(
DATA
+
lang
+
"/tagger.dat"
,
True
,
2
)
self
.
sen
=
freeling
.
senses
(
DATA
+
lang
+
"/senses.dat"
)
self
.
sen
=
freeling
.
senses
(
DATA
+
lang
+
"/senses.dat"
)
#Listas de palabras de Dale-Chall
CLASSDIR
=
"/home/sinai/Experiments/CLEF-PAN/"
f
=
open
(
CLASSDIR
+
'DaleChall.txt'
)
f
=
open
(
CLASSDIR
+
'
/home/garciacumbreras18/
DaleChall.txt'
)
lines
=
f
.
readlines
()
lines
=
f
.
readlines
()
f
.
close
()
f
.
close
()
...
@@ -143,4 +141,4 @@ class ComplexityFrench(ComplexityLanguage):
...
@@ -143,4 +141,4 @@ class ComplexityFrench(ComplexityLanguage):
return
metrics
return
metrics
\ No newline at end of file
ComplexityItalian.py
View file @
8f96e893
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import
sys
sys
.
path
.
append
(
'/home/garciacumbreras18/dist/freeling/APIs/python'
)
import
freeling
import
freeling
import
os
import
os
import
re
import
re
...
@@ -12,9 +14,9 @@ class ComplexityItalian():
...
@@ -12,9 +14,9 @@ class ComplexityItalian():
def
__init__
(
self
,
lang
=
'it'
):
def
__init__
(
self
,
lang
=
'it'
):
## Modify this line to be your FreeLing installation directory
## Modify this line to be your FreeLing installation directory
FREELINGDIR
=
"/
usr/local
"
FREELINGDIR
=
"/
home/garciacumbreras18/dist/freeling
"
DATA
=
FREELINGDIR
+
"/
share/freeling
/"
DATA
=
FREELINGDIR
+
"/
data
/"
CLASSDIR
=
"/home/sinai/Experiments/CLEF-PAN/"
self
.
DATA
=
DATA
self
.
DATA
=
DATA
self
.
lang
=
lang
self
.
lang
=
lang
freeling
.
util_init_locale
(
"default"
)
freeling
.
util_init_locale
(
"default"
)
...
@@ -279,4 +281,4 @@ class ComplexityItalian():
...
@@ -279,4 +281,4 @@ class ComplexityItalian():
\ No newline at end of file
ComplexityLanguage.py
View file @
8f96e893
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import
sys
sys
.
path
.
append
(
'/home/garciacumbreras18/dist/freeling/APIs/python'
)
import
freeling
import
freeling
import
os
import
os
import
re
import
re
...
@@ -8,14 +12,14 @@ import scipy.stats
...
@@ -8,14 +12,14 @@ import scipy.stats
import
math
import
math
class
ComplexityLanguage
():
class
ComplexityLanguage
():
def
__init__
(
self
,
lang
):
def
__init__
(
self
,
lang
):
## Modify this line to be your FreeLing installation directory
## Modify this line to be your FreeLing installation directory
FREELINGDIR
=
"/
usr/local
"
FREELINGDIR
=
"/
home/garciacumbreras18/dist/freeling
"
DATA
=
FREELINGDIR
+
"/
share/freeling
/"
DATA
=
FREELINGDIR
+
"/
data
/"
self
.
DATA
=
DATA
self
.
DATA
=
DATA
self
.
lang
=
lang
self
.
lang
=
lang
freeling
.
util_init_locale
(
"default"
)
freeling
.
util_init_locale
(
"default"
)
...
@@ -24,49 +28,49 @@ class ComplexityLanguage():
...
@@ -24,49 +28,49 @@ class ComplexityLanguage():
# create options set for maco analyzer. Default values are Ok, except for data files.
# create options set for maco analyzer. Default values are Ok, except for data files.
op
=
freeling
.
maco_options
(
lang
)
op
=
freeling
.
maco_options
(
lang
)
op
.
set_data_files
(
""
,
op
.
set_data_files
(
""
,
self
.
DATA
+
"common/punct.dat"
,
self
.
DATA
+
"common/punct.dat"
,
self
.
DATA
+
self
.
lang
+
"/dicc.src"
,
self
.
DATA
+
self
.
lang
+
"/dicc.src"
,
self
.
DATA
+
self
.
lang
+
"/afixos.dat"
,
self
.
DATA
+
self
.
lang
+
"/afixos.dat"
,
""
,
""
,
self
.
DATA
+
self
.
lang
+
"/locucions.dat"
,
self
.
DATA
+
self
.
lang
+
"/locucions.dat"
,
self
.
DATA
+
self
.
lang
+
"/np.dat"
,
self
.
DATA
+
self
.
lang
+
"/np.dat"
,
self
.
DATA
+
self
.
lang
+
"/quantities.dat"
,
self
.
DATA
+
self
.
lang
+
"/quantities.dat"
,
self
.
DATA
+
self
.
lang
+
"/probabilitats.dat"
)
self
.
DATA
+
self
.
lang
+
"/probabilitats.dat"
)
# create analyzers
# create analyzers
self
.
tk
=
freeling
.
tokenizer
(
self
.
DATA
+
self
.
lang
+
"/tokenizer.dat"
)
self
.
tk
=
freeling
.
tokenizer
(
self
.
DATA
+
self
.
lang
+
"/tokenizer.dat"
)
#self.sp=freeling.splitter("/home/sinai/Freeling/data/"+self.lang+"/splitter.dat")
#self.sp=freeling.splitter("/home/sinai/Freeling/data/"+self.lang+"/splitter.dat")
self
.
sp
=
freeling
.
splitter
(
self
.
DATA
+
self
.
lang
+
"/splitter.dat"
)
self
.
sp
=
freeling
.
splitter
(
self
.
DATA
+
self
.
lang
+
"/splitter.dat"
)
self
.
mf
=
freeling
.
maco
(
op
)
self
.
mf
=
freeling
.
maco
(
op
)
# activate mmorpho modules to be used in next call
# activate mmorpho modules to be used in next call
self
.
mf
.
set_active_options
(
False
,
True
,
True
,
True
,
# select which among created
self
.
mf
.
set_active_options
(
False
,
True
,
True
,
True
,
# select which among created
True
,
True
,
False
,
True
,
# submodules are to be used.
True
,
True
,
False
,
True
,
# submodules are to be used.
True
,
True
,
True
,
True
)
# default: all created submodules are used
True
,
True
,
True
,
True
)
# default: all created submodules are used
# create tagger, sense anotator, and parsers
# create tagger, sense anotator, and parsers
self
.
tg
=
freeling
.
hmm_tagger
(
self
.
DATA
+
self
.
lang
+
"/tagger.dat"
,
True
,
2
)
self
.
tg
=
freeling
.
hmm_tagger
(
self
.
DATA
+
self
.
lang
+
"/tagger.dat"
,
True
,
2
)
self
.
sen
=
freeling
.
senses
(
self
.
DATA
+
self
.
lang
+
"/senses.dat"
)
self
.
sen
=
freeling
.
senses
(
self
.
DATA
+
self
.
lang
+
"/senses.dat"
)
#self.parser= freeling.chart_parser(DATA+lang+"/chunker/grammar-chunk.dat")
#self.parser= freeling.chart_parser(DATA+lang+"/chunker/grammar-chunk.dat")
#self.dep=freeling.dep_txala(DATA+lang+"/dep_txala/dependences.dat", self.parser.get_start_symbol())
#self.dep=freeling.dep_txala(DATA+lang+"/dep_txala/dependences.dat", self.parser.get_start_symbol())
"""
"""
config es una lista de valores booleanos que activa o desactivan el cálculo de una medida
config es una lista de valores booleanos que activa o desactivan el cálculo de una medida
config = [
config = [
True|False, # PUNCTUATION MARKS
True|False, # PUNCTUATION MARKS
True|False, # SCI
True|False, # SCI
True|False, # ARI
True|False, # ARI
True|False, # MU
True|False, # MU
]
]
Si config == None se calculan todas las métricas de complejidad soportadas
Si config == None se calculan todas las métricas de complejidad soportadas
"""
"""
self
.
config
=
[
True
,
True
,
True
,
True
]
self
.
config
=
[
True
,
True
,
True
,
True
]
self
.
metricsStr
=
[
'AVERAGE PUNCTUATION MARKS'
,
'SCI'
,
'ARI'
,
'MU'
]
self
.
metricsStr
=
[
'AVERAGE PUNCTUATION MARKS'
,
'SCI'
,
'ARI'
,
'MU'
]
pass
pass
def
textProcessing
(
self
,
text
):
def
textProcessing
(
self
,
text
):
text
=
text
.
replace
(
u'
\xa0
'
,
u' '
)
.
replace
(
'"'
,
''
)
text
=
text
.
replace
(
u'
\xa0
'
,
u' '
)
.
replace
(
'"'
,
''
)
# meter todas las funciones en una patron de los tokens válidos
# meter todas las funciones en una patron de los tokens válidos
...
@@ -86,9 +90,9 @@ class ComplexityLanguage():
...
@@ -86,9 +90,9 @@ class ComplexityLanguage():
#ls = self.dep.analyze(ls)
#ls = self.dep.analyze(ls)
#print("After dependencies", len(ls))
#print("After dependencies", len(ls))
self
.
sentences
=
ls
self
.
sentences
=
ls
self
.
N_sentences
=
len
(
ls
)
self
.
N_sentences
=
len
(
ls
)
self
.
sp
.
close_session
(
sid
)
self
.
sp
.
close_session
(
sid
)
#print('Las oraciones: ', self.sentences)
#print('Las oraciones: ', self.sentences)
'''
'''
Filtra aquellos tokens que no sean adjetivos, verbos o sustantivos
Filtra aquellos tokens que no sean adjetivos, verbos o sustantivos
...
@@ -97,11 +101,11 @@ class ComplexityLanguage():
...
@@ -97,11 +101,11 @@ class ComplexityLanguage():
for
sentence
in
self
.
sentences
:
for
sentence
in
self
.
sentences
:
ws
=
sentence
.
get_words
();
ws
=
sentence
.
get_words
();
pos_content_sentences
.
append
([
w
for
w
in
ws
if
re
.
match
(
'N.*|V.*|A.*'
,
w
.
get_tag
())])
pos_content_sentences
.
append
([
w
for
w
in
ws
if
re
.
match
(
'N.*|V.*|A.*'
,
w
.
get_tag
())])
self
.
pos_content_sentences
=
pos_content_sentences
self
.
pos_content_sentences
=
pos_content_sentences
return
self
.
pos_content_sentences
,
self
.
sentences
,
self
.
N_sentences
return
self
.
pos_content_sentences
,
self
.
sentences
,
self
.
N_sentences
def
punctuationMarks
(
self
):
def
punctuationMarks
(
self
):
#Solo nos interesa contar los tokens que sean signo de puntuación.
#Solo nos interesa contar los tokens que sean signo de puntuación.
#Number of words.
#Number of words.
...
@@ -114,10 +118,10 @@ class ComplexityLanguage():
...
@@ -114,10 +118,10 @@ class ComplexityLanguage():
else
:
else
:
lwords
.
append
(
w
.
get_form
())
lwords
.
append
(
w
.
get_form
())
self
.
N_words
=
len
(
lwords
)
self
.
N_words
=
len
(
lwords
)
#print('Number of words (N_w): ', self.N_words, '\n' )
#print('Number of words (N_w): ', self.N_words, '\n' )
self
.
N_punctuation
=
len
(
punctuation
)
self
.
N_punctuation
=
len
(
punctuation
)
self
.
punctuation
=
punctuation
self
.
punctuation
=
punctuation
...
@@ -125,14 +129,14 @@ class ComplexityLanguage():
...
@@ -125,14 +129,14 @@ class ComplexityLanguage():
punctuation_over_words
=
0
punctuation_over_words
=
0
else
:
else
:
punctuation_over_words
=
self
.
N_punctuation
/
self
.
N_words
punctuation_over_words
=
self
.
N_punctuation
/
self
.
N_words
self
.
punctuation_over_words
=
punctuation_over_words
self
.
punctuation_over_words
=
punctuation_over_words
#print("PUNCTUATION MARKS = ", self.N_punctuation,'\n')
#print("PUNCTUATION MARKS = ", self.N_punctuation,'\n')
return
self
.
punctuation_over_words
,
self
.
N_punctuation
,
self
.
punctuation
,
self
.
N_words
return
self
.
punctuation_over_words
,
self
.
N_punctuation
,
self
.
punctuation
,
self
.
N_words
def
sentenceComplexity
(
self
):
def
sentenceComplexity
(
self
):
#Number of complex sentences
#Number of complex sentences
N_cs
=
0
N_cs
=
0
for
sentence
in
self
.
sentences
:
for
sentence
in
self
.
sentences
:
...
@@ -149,25 +153,25 @@ class ComplexityLanguage():
...
@@ -149,25 +153,25 @@ class ComplexityLanguage():
else
:
else
:
previous_is_verb
=
False
previous_is_verb
=
False
if
count
>
0
:
if
count
>
0
:
N_cs
+=
1
N_cs
+=
1
self
.
N_cs
=
N_cs
self
.
N_cs
=
N_cs
#print("Number of complex sentences: ", self.N_cs, "\n")
#print("Number of complex sentences: ", self.N_cs, "\n")
ASL
=
self
.
N_words
/
self
.
N_sentences
ASL
=
self
.
N_words
/
self
.
N_sentences
self
.
ASL
=
ASL
self
.
ASL
=
ASL
#print("Average Sentence Length (ASL) = ", self.ASL, '\n')
#print("Average Sentence Length (ASL) = ", self.ASL, '\n')
CS
=
self
.
N_cs
/
self
.
N_sentences
CS
=
self
.
N_cs
/
self
.
N_sentences
self
.
CS
=
CS
self
.
CS
=
CS
#print("Complex Sentences (CS) = ", self.CS, '\n')
#print("Complex Sentences (CS) = ", self.CS, '\n')
SCI
=
(
ASL
+
CS
)
/
2
SCI
=
(
ASL
+
CS
)
/
2
self
.
SCI
=
SCI
self
.
SCI
=
SCI
#print("SENTENCE COMPLEXITY INDEX:(SCI) = ", self.SCI, "\n")
#print("SENTENCE COMPLEXITY INDEX:(SCI) = ", self.SCI, "\n")
return
self
.
SCI
,
self
.
CS
,
self
.
N_cs
,
self
.
ASL
return
self
.
SCI
,
self
.
CS
,
self
.
N_cs
,
self
.
ASL
def
autoReadability
(
self
):
def
autoReadability
(
self
):
#Number of characters
#Number of characters
count
=
0
count
=
0
...
@@ -178,24 +182,24 @@ class ComplexityLanguage():
...
@@ -178,24 +182,24 @@ class ComplexityLanguage():
count
+=
1
count
+=
1
else
:
else
:
listwords
.
append
(
w
.
get_form
())
listwords
.
append
(
w
.
get_form
())
self
.
listwords
=
listwords
self
.
listwords
=
listwords
N_charac
=
0
N_charac
=
0
for
characters
in
self
.
listwords
:
for
characters
in
self
.
listwords
:
N_charac
+=
len
(
characters
)
N_charac
+=
len
(
characters
)
self
.
N_charac
=
N_charac
self
.
N_charac
=
N_charac
#print("Number of characters: ", self.N_charac, "\n")
#print("Number of characters: ", self.N_charac, "\n")
ARI
=
4.71
*
self
.
N_charac
/
self
.
N_words
+
0.5
*
self
.
N_words
/
self
.
N_sentences
-
21.43
ARI
=
4.71
*
self
.
N_charac
/
self
.
N_words
+
0.5
*
self
.
N_words
/
self
.
N_sentences
-
21.43
self
.
ARI
=
ARI
self
.
ARI
=
ARI
#print("AUTOMATED READABILITY INDEX (ARI) = ", self.ARI, '\n')
#print("AUTOMATED READABILITY INDEX (ARI) = ", self.ARI, '\n')
return
self
.
ARI
,
self
.
N_charac
,
self
.
listwords
return
self
.
ARI
,
self
.
N_charac
,
self
.
listwords
def
mureadability
(
self
):
def
mureadability
(
self
):
#Number of syllables and Number of words with 3 or more syllables:tagger
#Number of syllables and Number of words with 3 or more syllables:tagger
N_syllables
=
0
N_syllables
=
0
N_syllables3
=
0
N_syllables3
=
0
...
@@ -207,10 +211,10 @@ class ComplexityLanguage():
...
@@ -207,10 +211,10 @@ class ComplexityLanguage():
count
+=
1
count
+=
1
if
count
>=
3
:
if
count
>=
3
:
N_syllables3
+=
1
N_syllables3
+=
1
self
.
N_syllables
=
N_syllables
self
.
N_syllables
=
N_syllables
self
.
N_syllables3
=
N_syllables3
self
.
N_syllables3
=
N_syllables3
#Number of letters
#Number of letters
N_letters
=
0
N_letters
=
0
letters
=
[]
letters
=
[]
...
@@ -220,33 +224,33 @@ class ComplexityLanguage():
...
@@ -220,33 +224,33 @@ class ComplexityLanguage():
letters
.
append
(
word
)
letters
.
append
(
word
)
N_letters
+=
len
(
word
)
N_letters
+=
len
(
word
)
vecletters
.
append
(
len
(
word
))
vecletters
.
append
(
len
(
word
))
self
.
letters
=
letters
self
.
letters
=
letters
self
.
N_letters
=
N_letters
self
.
N_letters
=
N_letters
self
.
vecletters
=
vecletters
self
.
vecletters
=
vecletters
x
=
self
.
N_letters
/
self
.
N_words
x
=
self
.
N_letters
/
self
.
N_words
varianza
=
np
.
var
(
self
.
vecletters
)
varianza
=
np
.
var
(
self
.
vecletters
)
mu
=
(
self
.
N_words
/
(
self
.
N_words
-
1
))
*
(
x
/
varianza
)
*
100
mu
=
(
self
.
N_words
/
(
self
.
N_words
-
1
))
*
(
x
/
varianza
)
*
100
#print("READABILITY MU: ", mu, "\n")
#print("READABILITY MU: ", mu, "\n")
self
.
mu
=
mu
self
.
mu
=
mu
return
self
.
mu
,
self
.
N_syllables
,
self
.
N_syllables3
,
self
.
letters
,
self
.
N_letters
,
self
.
vecletters
return
self
.
mu
,
self
.
N_syllables
,
self
.
N_syllables3
,
self
.
letters
,
self
.
N_letters
,
self
.
vecletters
def
calcMetrics
(
self
,
text
):
def
calcMetrics
(
self
,
text
):
"""
"""
Calcula la métricas de complejidad activadas en la configuración
Calcula la métricas de complejidad activadas en la configuración
"""
"""
self
.
textProcessing
(
text
)
self
.
textProcessing
(
text
)
metrics
=
{}
metrics
=
{}
punctuationMarks
=
None
punctuationMarks
=
None
autoreadability
=
None
autoreadability
=
None
sentencecomplexity
=
None
sentencecomplexity
=
None
for
i
in
range
(
0
,
len
(
self
.
metricsStr
)):
for
i
in
range
(
0
,
len
(
self
.
metricsStr
)):
if
self
.
config
==
None
or
self
.
config
[
i
]
and
self
.
metricsStr
[
i
]
==
'AVERAGE PUNCTUATION MARKS'
:
if
self
.
config
==
None
or
self
.
config
[
i
]
and
self
.
metricsStr
[
i
]
==
'AVERAGE PUNCTUATION MARKS'
:
punctuationmarks
=
self
.
punctuationMarks
()
punctuationmarks
=
self
.
punctuationMarks
()
metrics
[
'AVERAGE PUNCTUATION MARKS'
]
=
punctuationmarks
[
0
]
metrics
[
'AVERAGE PUNCTUATION MARKS'
]
=
punctuationmarks
[
0
]
...
@@ -259,9 +263,9 @@ class ComplexityLanguage():
...
@@ -259,9 +263,9 @@ class ComplexityLanguage():
if
self
.
config
==
None
or
self
.
config
[
i
]
and
self
.
metricsStr
[
i
]
==
'MU'
:
if
self
.
config
==
None
or
self
.
config
[
i
]
and
self
.
metricsStr
[
i
]
==
'MU'
:
mureadability
=
self
.
mureadability
()
mureadability
=
self
.
mureadability
()
metrics
[
'MU'
]
=
mureadability
[
0
]
metrics
[
'MU'
]
=
mureadability
[
0
]
return
metrics
return
metrics
def
getPOS
(
self
,
text
):
def
getPOS
(
self
,
text
):
self
.
textProcessing
(
text
)
self
.
textProcessing
(
text
)
pos_sentences
=
[]
pos_sentences
=
[]
...
@@ -270,6 +274,5 @@ class ComplexityLanguage():
...
@@ -270,6 +274,5 @@ class ComplexityLanguage():
pos_sentences
.
append
([
w
.
get_tag
()
for
w
in
ws
])
pos_sentences
.
append
([
w
.
get_tag
()
for
w
in
ws
])
#print('POS',pos_sentences)
#print('POS',pos_sentences)
self
.
pos_sentences
=
pos_sentences
self
.
pos_sentences
=
pos_sentences
return
self
.
pos_sentences
return
self
.
pos_sentences
\ No newline at end of file
ComplexitySpanish.py
View file @
8f96e893
...
@@ -17,7 +17,7 @@ class ComplexitySpanish(ComplexityLanguage):
...
@@ -17,7 +17,7 @@ class ComplexitySpanish(ComplexityLanguage):
self
.
dep
=
freeling
.
dep_txala
(
self
.
DATA
+
self
.
lang
+
"/dep_txala/dependences.dat"
,
self
.
parser
.
get_start_symbol
())
self
.
dep
=
freeling
.
dep_txala
(
self
.
DATA
+
self
.
lang
+
"/dep_txala/dependences.dat"
,
self
.
parser
.
get_start_symbol
())
# Para leer el texto que introducimos
# Para leer el texto que introducimos
CLASSDIR
=
"/home/
sinai/Experiments/CLEF-PAN
/"
CLASSDIR
=
"/home/
garciacumbreras18
/"
f
=
open
(
CLASSDIR
+
'CREA_total.txt'
)
f
=
open
(
CLASSDIR
+
'CREA_total.txt'
)
lines
=
f
.
readlines
()
lines
=
f
.
readlines
()
...
@@ -300,4 +300,4 @@ class ComplexitySpanish(ComplexityLanguage):
...
@@ -300,4 +300,4 @@ class ComplexitySpanish(ComplexityLanguage):
metrics
[
'CRAWFORD'
]
=
self
.
yearsCrawford
()
metrics
[
'CRAWFORD'
]
=
self
.
yearsCrawford
()
return
metrics
return
metrics
\ No newline at end of file
classify_comp
deleted
100755 → 0
View file @
1df9d9e2
#!/usr/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
# Universidad de Jaén
# 2018
import
json
import
os
from
ComplexityLanguage
import
ComplexityLanguage
from
ComplexitySpanish
import
ComplexitySpanish
from
ComplexityEnglish
import
ComplexityEnglish
from
ComplexityFrench
import
ComplexityFrench
from
ComplexityPolish
import
ComplexityPolish
from
ComplexityItalian
import
ComplexityItalian
import
pandas
as
pd
import
numpy
as
np
import
matplotlib.pyplot
as
plt
from
mpl_toolkits.mplot3d
import
Axes3D
from
sklearn.decomposition
import
PCA
from
sklearn
import
preprocessing
import
argparse
parser
=
argparse
.
ArgumentParser
(
description
=
'PAN2018 author identificator based on text complexity metrics'
)
parser
.
add_argument
(
'-i'
,
type
=
string
,
help
=
'input directory'
)
parser
.
add_argument
(
'-o'
,
type
=
string
,
help
=
'output directory'
)
args
=
parser
.
parse_args
()
print
(
args
.
i
,
args
.
o
)
exit
()
print
(
'Loading complexity analyzers for different languages...
\n
'
,
flush
=
True
)
mlComplexityText
=
{
'en'
:
ComplexityEnglish
(),
'sp'
:
ComplexitySpanish
(),
'fr'
:
ComplexityFrench
(),
'pl'
:
ComplexityPolish
(),
'it'
:
ComplexityItalian
()
}
INPUT_DIR
=
args
.
i
OUTPUT_DIR
=
args
.
o
with
open
(
INPUT_DIR
+
'/collection-info.json'
,
'r'
)
as
f
:
collectionInfo
=
json
.
load
(
f
)
f
.
close
()
print
(
type
(
collectionInfo
))
\ No newline at end of file
classify_comp.py
0 → 100755
View file @
8f96e893
#!/home/garciacumbreras18/anaconda3/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
###############################################################################
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
#
# Universidad de Jaén - 2018
###############################################################################
"""PAN2018 author identification based on text complexity metrics.

Usage: classify_comp.py -i INPUT_DIR -o OUTPUT_DIR

For every problem listed in INPUT_DIR/collection-info.json the script:

1. computes the complexity metrics of each text of known authorship
   (training data) and of each text in the problem's unknown folder
   (test data) with the analyzer registered for the problem language;
2. trains a LinearSVC on the L2-normalized training metrics;
3. writes the predictions to OUTPUT_DIR/answers-<problem-name>.json as a list
   of {"unknown-text": ..., "predicted-author": ...} objects.
"""
import argparse
import json
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import svm

from ComplexityLanguage import ComplexityLanguage
from ComplexitySpanish import ComplexitySpanish
from ComplexityEnglish import ComplexityEnglish
from ComplexityFrench import ComplexityFrench
from ComplexityPolish import ComplexityPolish
from ComplexityItalian import ComplexityItalian

# Bookkeeping columns added to every row; they are never used as features.
META_COLUMNS = ['problem', 'language', 'candidate', 'filename', 'label']

## ----------------------------------------------------------------------------
##
## Read command line arguments
##
parser = argparse.ArgumentParser(
    description='PAN2018 author identificator based on text complexity metrics')
# Both directories are mandatory: without them the path building below would
# fail with an obscure TypeError on None.
parser.add_argument('-i', type=str, required=True, help='input directory')
parser.add_argument('-o', type=str, required=True, help='output directory')
args = parser.parse_args()
INPUT_DIR, OUTPUT_DIR = args.i, args.o

## ----------------------------------------------------------------------------
##
## Load of analyzers
##
print('Loading complexity analyzers for different languages...\n', flush=True)
mlComplexityText = {
    'en': ComplexityEnglish(),
    'sp': ComplexitySpanish(),
    'fr': ComplexityFrench(),
    'pl': ComplexityPolish(),
    'it': ComplexityItalian(),
}

## ----------------------------------------------------------------------------
##
## Corpus loading (both, train and test data sets)
##
# One single-row DataFrame per text is collected here and concatenated once at
# the end (DataFrame.append was removed in pandas 2.0 and copied the whole
# frame on every call).
known_frames = []
unknown_frames = []
labels = {}         # problem-name + author-name -> integer class label
labels_cand = []    # integer class label -> problem-name + author-name
label_authors = []  # integer class label -> author-name

#
# Walk through every problem of the collection
#
print('Loading collection-info.json file from', args.i, flush=True)
with open(os.path.join(INPUT_DIR, 'collection-info.json'), 'r',
          encoding='utf-8') as f:
    collectionInfo = json.load(f)

for problem in collectionInfo:
    print('\n\nProblem: ', problem['problem-name'], flush=True)
    print('Language: ', problem['language'], flush=True)
    problem_dir = os.path.join(INPUT_DIR, problem['problem-name'])

    #
    # Pick the complexity analyzer of the problem language
    #
    complexityText = mlComplexityText[problem['language']]

    print("Loading problem data...\n", flush=True)
    with open(os.path.join(problem_dir, 'problem-info.json'), 'r',
              encoding='utf-8') as problem_info_fhd:
        problem_info = json.load(problem_info_fhd)

    #
    # Read the texts of known authorship (TRAINING TEXTS)
    #
    print("Loading training data")
    for candidate in problem_info['candidate-authors']:
        print('Candidate: ', candidate['author-name'], flush=True)
        candidate_dir = os.path.join(problem_dir, candidate['author-name'])
        files = os.listdir(candidate_dir)
        probcand = problem['problem-name'] + candidate['author-name']
        if probcand not in labels:
            labels[probcand] = len(labels)
            labels_cand += [probcand]
            label_authors += [candidate['author-name']]

        #
        # Process every text of this candidate
        #
        for i, nameFile in enumerate(files):
            print('Reading text file: ', nameFile, flush=True)
            with open(os.path.join(candidate_dir, nameFile), 'r',
                      encoding='utf-8') as context:
                calcmetrics = complexityText.calcMetrics(context.read())
            dfi = pd.DataFrame(calcmetrics, index=[i])
            dfi['problem'] = problem['problem-name']
            dfi['language'] = problem['language']
            dfi['candidate'] = candidate['author-name']
            dfi['label'] = labels[probcand]
            dfi['filename'] = nameFile
            known_frames.append(dfi)

    #
    # If a ground truth exists, read it to learn the true authors
    #
    unknown_candidates = {}
    ground_truth_path = os.path.join(problem_dir, 'ground-truth.json')
    if os.path.isfile(ground_truth_path):
        print("Reading ground truth...", flush=True)
        with open(ground_truth_path, 'r', encoding='utf-8') as ground_truth_fhd:
            ground_truth = json.load(ground_truth_fhd)
        unknown_candidates = {item['unknown-text']: item['true-author']
                              for item in ground_truth['ground_truth']}

    #
    # Walk through the unlabelled files (TEST TEXTS)
    #
    print("Loading test data", flush=True)
    unknown_dir = os.path.join(problem_dir, problem_info['unknown-folder'])
    for i, unknown_file in enumerate(os.listdir(unknown_dir)):
        print("Analyzing file", unknown_file, flush=True)
        with open(os.path.join(unknown_dir, unknown_file), 'r',
                  encoding='utf-8') as unknown_fhd:
            calcmetrics = complexityText.calcMetrics(unknown_fhd.read())
        dfi = pd.DataFrame(calcmetrics, index=[i])
        dfi['problem'] = problem['problem-name']
        dfi['language'] = problem['language']
        # A file missing from the ground truth (or no ground truth at all)
        # simply stays unlabelled instead of raising KeyError.
        true_author = unknown_candidates.get(unknown_file)
        if true_author:
            dfi['candidate'] = true_author
            dfi['label'] = labels.get(problem['problem-name'] + true_author)
        else:
            dfi['candidate'] = None
            dfi['label'] = None
        dfi['filename'] = unknown_file
        unknown_frames.append(dfi)

complexity_known = pd.concat(known_frames) if known_frames else pd.DataFrame()
complexity_unknown = pd.concat(unknown_frames) if unknown_frames else pd.DataFrame()

## ----------------------------------------------------------------------------
##
## Training and classification
##
os.makedirs(OUTPUT_DIR, exist_ok=True)

clf = svm.LinearSVC(C=1)

problem_names = [] if complexity_known.empty else complexity_known['problem'].unique()
for problem in problem_names:
    answers = []
    print('------- Training and classifying ', problem, flush=True)

    #
    # Known texts are the training set, unknown texts the test set
    #
    train = complexity_known.loc[complexity_known['problem'] == problem]
    if complexity_unknown.empty:
        test = complexity_unknown
    else:
        test = complexity_unknown.loc[complexity_unknown['problem'] == problem]

    if not test.empty:
        # Use only the metrics that are fully defined on BOTH sides.  Dropping
        # NaN columns independently on each side could leave train and test
        # with different feature sets, and dropped the (all-None) test label
        # column when no ground truth is available.
        feature_columns = [
            column for column in train.columns
            if column not in META_COLUMNS
            and column in test.columns
            and train[column].notna().all()
            and test[column].notna().all()
        ]
        train_target = train['label']
        train_data = preprocessing.normalize(train[feature_columns], norm='l2')
        test_data = preprocessing.normalize(test[feature_columns], norm='l2')

        # Train with the texts of known authorship, predict the unknown ones
        y_pred = clf.fit(train_data, train_target).predict(test_data)

        # Predictions are positional, so pair them with the rows in order
        # rather than indexing y_pred with the DataFrame index labels.
        for filename, predicted_label in zip(test['filename'], y_pred):
            answers.append({
                'unknown-text': filename,
                'predicted-author': label_authors[predicted_label],
            })

    with open(os.path.join(OUTPUT_DIR, 'answers-' + problem + '.json'), 'w',
              encoding='utf-8') as answers_fhd:
        json.dump(answers, answers_fhd, indent=4)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment