Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
SINAI
/
clef-pan2018
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Settings
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
94131f3d
authored
May 17, 2018
by
Arturo Montejo Ráez
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
new version
parent
8f96e893
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
269 additions
and
34 deletions
ComplexityPolish.py
classify_comp.py
classify_postf.py
ComplexityPolish.py
View file @
94131f3d
import
nltk
import
sys
sys
.
path
.
append
(
'/home/garciacumbreras18/dist/treetagger'
)
import
nltk
import
re
import
re
from
treetagger
import
TreeTagger
class
ComplexityPolish
():
class
ComplexityPolish
():
def
__init__
(
self
,
lang
=
'pl'
):
def
__init__
(
self
,
lang
=
'pl'
):
"""
"""
config es una lista de valores booleanos que activa o desactivan el cálculo de una medida
config es una lista de valores booleanos que activa o desactivan el cálculo de una medida
config = [
config = [
True|False, # PUNCTUATION MARKS
True|False, # PUNCTUATION MARKS
True|False, # ARI
True|False, # ARI
True|False, # FOG
True|False, # FOG
True|False, # FLESCH
True|False, # FLESCH
True|False, # FLESCH-KINCAID
True|False, # FLESCH-KINCAID
True|False, # PISAREK
True|False, # PISAREK
]
]
Si config == None se calculan todas las métricas de complejidad soportadas
Si config == None se calculan todas las métricas de complejidad soportadas
"""
"""
self
.
config
=
[
True
,
True
,
True
,
True
,
True
,
True
]
self
.
config
=
[
True
,
True
,
True
,
True
,
True
,
True
]
self
.
metricsStr
=
[
'AVERAGE PUNCTUATION MARKS'
,
'ARI'
,
'FOG'
,
'FLESCH'
,
'FLESCH-KINCAID'
,
'PISAREK'
]
self
.
metricsStr
=
[
'AVERAGE PUNCTUATION MARKS'
,
'ARI'
,
'FOG'
,
'FLESCH'
,
'FLESCH-KINCAID'
,
'PISAREK'
]
pass
pass
def
textProcessing
(
self
,
text
):
def
textProcessing
(
self
,
text
):
text
=
text
.
replace
(
u'
\xa0
'
,
u' '
)
text
=
text
.
replace
(
u'
\xa0
'
,
u' '
)
'''
'''
...
@@ -41,7 +45,7 @@ class ComplexityPolish():
...
@@ -41,7 +45,7 @@ class ComplexityPolish():
N_text_tokens
=
len
(
self
.
text_tokens
)
N_text_tokens
=
len
(
self
.
text_tokens
)
self
.
N_text_tokens
=
N_text_tokens
self
.
N_text_tokens
=
N_text_tokens
#print('Tokens: ', self.N_text_tokens)
#print('Tokens: ', self.N_text_tokens)
# y ahora reorganizamos las oraciones a partir de los puntos aislados
# y ahora reorganizamos las oraciones a partir de los puntos aislados
sentences
=
[]
sentences
=
[]
ini
=
0
ini
=
0
...
@@ -60,8 +64,8 @@ class ComplexityPolish():
...
@@ -60,8 +64,8 @@ class ComplexityPolish():
N_sentences
=
len
(
sentences
)
N_sentences
=
len
(
sentences
)
self
.
N_sentences
=
N_sentences
self
.
N_sentences
=
N_sentences
#print('Sentences: ',self.sentences)
#print('Sentences: ',self.sentences)
N_charac
=
0
N_charac
=
0
for
word
in
self
.
text_tokens
:
for
word
in
self
.
text_tokens
:
...
@@ -79,15 +83,15 @@ class ComplexityPolish():
...
@@ -79,15 +83,15 @@ class ComplexityPolish():
count
+=
1
count
+=
1
if
count
>=
3
:
if
count
>=
3
:
N_syllables3
+=
1
N_syllables3
+=
1
self
.
N_syllables
=
N_syllables
self
.
N_syllables
=
N_syllables
self
.
N_syllables3
=
N_syllables3
self
.
N_syllables3
=
N_syllables3
#print('The number of syllables is: ',self.N_syllables)
#print('The number of syllables is: ',self.N_syllables)
#print('The number of syllables3 is: ', self.N_syllables3)
#print('The number of syllables3 is: ', self.N_syllables3)
return
self
.
text_tokens
,
self
.
N_text_tokens
,
self
.
sentences
,
self
.
N_sentences
,
self
.
N_charac
,
self
.
N_syllables
,
self
.
N_syllables3
return
self
.
text_tokens
,
self
.
N_text_tokens
,
self
.
sentences
,
self
.
N_sentences
,
self
.
N_charac
,
self
.
N_syllables
,
self
.
N_syllables3
def
punctuationMarks
(
self
):
def
punctuationMarks
(
self
):
N_punctuation
=
0
N_punctuation
=
0
letters
=
[]
letters
=
[]
...
@@ -96,34 +100,34 @@ class ComplexityPolish():
...
@@ -96,34 +100,34 @@ class ComplexityPolish():
if
re
.
match
(
'[a-zA-Z]|á|ó|í|ú|é'
,
word
):
if
re
.
match
(
'[a-zA-Z]|á|ó|í|ú|é'
,
word
):
letters
.
append
(
word
)
letters
.
append
(
word
)
N_letters
+=
len
(
word
)
N_letters
+=
len
(
word
)
else
:
else
:
N_punctuation
+=
1
N_punctuation
+=
1
self
.
words
=
letters
self
.
words
=
letters
self
.
N_words
=
len
(
letters
)
self
.
N_words
=
len
(
letters
)
#print('N_words: ', self.N_words)
#print('N_words: ', self.N_words)
self
.
N_letters
=
N_letters
self
.
N_letters
=
N_letters
self
.
N_punctuation
=
N_punctuation
self
.
N_punctuation
=
N_punctuation
if
self
.
N_words
==
0
:
if
self
.
N_words
==
0
:
punctuation_over_words
=
0
punctuation_over_words
=
0
else
:
else
:
punctuation_over_words
=
self
.
N_punctuation
/
self
.
N_words
punctuation_over_words
=
self
.
N_punctuation
/
self
.
N_words
self
.
punctuation_over_words
=
punctuation_over_words
self
.
punctuation_over_words
=
punctuation_over_words
#print('The number of letter is: ', N_letters)
#print('The number of letter is: ', N_letters)
#print('The list of letter is: ', letters)
#print('The list of letter is: ', letters)
#print('The PUNCTUATION MARKS is: ', self.N_punctuation, '\n')
#print('The PUNCTUATION MARKS is: ', self.N_punctuation, '\n')
return
self
.
punctuation_over_words
,
self
.
N_punctuation
,
self
.
words
,
self
.
N_words
,
self
.
N_letters
return
self
.
punctuation_over_words
,
self
.
N_punctuation
,
self
.
words
,
self
.
N_words
,
self
.
N_letters
def
readability
(
self
):
def
readability
(
self
):
ARI
=
4.71
*
self
.
N_charac
/
self
.
N_words
+
0.5
*
self
.
N_words
/
self
.
N_sentences
-
21.43
ARI
=
4.71
*
self
.
N_charac
/
self
.
N_words
+
0.5
*
self
.
N_words
/
self
.
N_sentences
-
21.43
self
.
ARI
=
ARI
self
.
ARI
=
ARI
#print("AUTOMATED READABILITY INDEX (ARI) = ", self.ARI, '\n')
#print("AUTOMATED READABILITY INDEX (ARI) = ", self.ARI, '\n')
fogreadability
=
0.4
*
(
self
.
N_words
/
self
.
N_sentences
+
100
*
self
.
N_syllables3
/
self
.
N_words
)
fogreadability
=
0.4
*
(
self
.
N_words
/
self
.
N_sentences
+
100
*
self
.
N_syllables3
/
self
.
N_words
)
self
.
fogreadability
=
fogreadability
self
.
fogreadability
=
fogreadability
#print("FOG: ", self.fogreadability, "\n")
#print("FOG: ", self.fogreadability, "\n")
...
@@ -133,29 +137,29 @@ class ComplexityPolish():
...
@@ -133,29 +137,29 @@ class ComplexityPolish():
#print("Syllables:", self.N_syllables)
#print("Syllables:", self.N_syllables)
#print("Sentences:", self.N_sentences)
#print("Sentences:", self.N_sentences)
#print("FLESCH: ", self.fleschreadability, "\n")
#print("FLESCH: ", self.fleschreadability, "\n")
fkincaidreadability
=
-
15.59
+
11.8
*
(
self
.
N_syllables
/
self
.
N_words
)
+
0.39
*
(
self
.
N_words
/
self
.
N_sentences
)
fkincaidreadability
=
-
15.59
+
11.8
*
(
self
.
N_syllables
/
self
.
N_words
)
+
0.39
*
(
self
.
N_words
/
self
.
N_sentences
)
self
.
fkincaidreadability
=
fkincaidreadability
self
.
fkincaidreadability
=
fkincaidreadability
#print("FLESCH-KINCAID: ", self.fkincaidreadability, "\n")
#print("FLESCH-KINCAID: ", self.fkincaidreadability, "\n")
self
.
fkincaidreadability
=
fkincaidreadability
self
.
fkincaidreadability
=
fkincaidreadability
pisarekreadability
=
(
self
.
N_words
/
self
.
N_sentences
)
/
3
+
self
.
N_syllables3
/
3
+
1
pisarekreadability
=
(
self
.
N_words
/
self
.
N_sentences
)
/
3
+
self
.
N_syllables3
/
3
+
1
self
.
pisarekreadability
=
pisarekreadability
self
.
pisarekreadability
=
pisarekreadability
#print("PISAREK (2007): ", self.pisarekreadability, "\n")
#print("PISAREK (2007): ", self.pisarekreadability, "\n")
return
self
.
ARI
,
self
.
fogreadability
,
self
.
fleschreadability
,
self
.
fkincaidreadability
,
self
.
pisarekreadability
return
self
.
ARI
,
self
.
fogreadability
,
self
.
fleschreadability
,
self
.
fkincaidreadability
,
self
.
pisarekreadability
def
calcMetrics
(
self
,
text
):
def
calcMetrics
(
self
,
text
):
self
.
textProcessing
(
text
)
self
.
textProcessing
(
text
)
metrics
=
{}
metrics
=
{}
metricsPo
=
self
.
metricsStr
metricsPo
=
self
.
metricsStr
readability
=
None
readability
=
None
for
i
in
range
(
0
,
len
(
metricsPo
)):
for
i
in
range
(
0
,
len
(
metricsPo
)):
if
self
.
config
==
None
or
self
.
config
[
i
]
and
metricsPo
[
i
]
==
'AVERAGE PUNCTUATION MARKS'
:
if
self
.
config
==
None
or
self
.
config
[
i
]
and
metricsPo
[
i
]
==
'AVERAGE PUNCTUATION MARKS'
:
punctuationmarks
=
self
.
punctuationMarks
()
punctuationmarks
=
self
.
punctuationMarks
()
metrics
[
'AVERAGE PUNCTUATION MARKS'
]
=
punctuationmarks
[
0
]
metrics
[
'AVERAGE PUNCTUATION MARKS'
]
=
punctuationmarks
[
0
]
...
@@ -174,6 +178,23 @@ class ComplexityPolish():
...
@@ -174,6 +178,23 @@ class ComplexityPolish():
if
self
.
config
==
None
or
self
.
config
[
i
]
and
metricsPo
[
i
]
==
'PISAREK'
:
if
self
.
config
==
None
or
self
.
config
[
i
]
and
metricsPo
[
i
]
==
'PISAREK'
:
if
not
readability
:
readability
=
self
.
readability
()
if
not
readability
:
readability
=
self
.
readability
()
metrics
[
'PISAREK'
]
=
readability
[
4
]
metrics
[
'PISAREK'
]
=
readability
[
4
]
return
metrics
return
metrics
\ No newline at end of file
def getPOS(self, text):
    """Return the coarse part-of-speech tags of ``text`` grouped by sentence.

    The text is tagged with a ``TreeTagger`` created for ``'polish'``.
    For every tagged item the element at index 1 is taken as the tag and
    only the part before the first ``':'`` is kept.  A tag equal to
    ``'SENT'`` closes the current sentence and is not itself stored.

    Args:
        text: the raw text to tag.

    Returns:
        A list of sentences, each one a list of tag strings.  The same
        list is also stored in ``self.pos_sentences``.
    """
    # NOTE(review): a new tagger is created on every call; presumably
    # this could be built once and reused -- confirm with the wrapper.
    tagger = TreeTagger(language='polish')

    pos_sentences = []
    current = []
    for item in tagger.tag(text):
        tag = item[1].split(':')[0]
        if tag == 'SENT':
            pos_sentences.append(current)
            current = []
        else:
            current.append(tag)

    # Fix: tags that follow the last 'SENT' marker (a text that does not
    # end with sentence-final punctuation) used to be discarded.  Keep
    # them as a final sentence so no tagged token is lost.
    if current:
        pos_sentences.append(current)

    self.pos_sentences = pos_sentences
    return self.pos_sentences
classify_comp.py
View file @
94131f3d
...
@@ -192,7 +192,7 @@ for problem in set(complexity_known['problem']):
...
@@ -192,7 +192,7 @@ for problem in set(complexity_known['problem']):
test_data
=
test
.
drop
([
'problem'
,
'language'
,
'candidate'
,
'filename'
,
'label'
],
axis
=
1
)
test_data
=
test
.
drop
([
'problem'
,
'language'
,
'candidate'
,
'filename'
,
'label'
],
axis
=
1
)
test_data
=
pd
.
DataFrame
(
preprocessing
.
normalize
(
test_data
,
norm
=
'l2'
))
test_data
=
pd
.
DataFrame
(
preprocessing
.
normalize
(
test_data
,
norm
=
'l2'
))
#Entrenamos con los textos con candidatos conocidos y predecimos con los datos desconocidos
#
Entrenamos con los textos con candidatos conocidos y predecimos con los datos desconocidos
y_pred
=
clf
.
fit
(
train_data
,
train_target
)
.
predict
(
test_data
)
y_pred
=
clf
.
fit
(
train_data
,
train_target
)
.
predict
(
test_data
)
for
index
,
row
in
test
.
iterrows
():
for
index
,
row
in
test
.
iterrows
():
...
...
classify_postf.py
0 → 100755
View file @
94131f3d
#!/home/garciacumbreras18/anaconda3/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
###############################################################################
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
#
# Universidad de Jaén - 2018
###############################################################################
import
json
import
os
from
ComplexityLanguage
import
ComplexityLanguage
from
ComplexitySpanish
import
ComplexitySpanish
from
ComplexityEnglish
import
ComplexityEnglish
from
ComplexityFrench
import
ComplexityFrench
from
ComplexityPolish
import
ComplexityPolish
from
ComplexityItalian
import
ComplexityItalian
import
pandas
as
pd
import
numpy
as
np
from
sklearn
import
preprocessing
from
sklearn.feature_extraction.text
import
TfidfVectorizer
import
argparse
## ----------------------------------------------------------------------------
##
## Read command line arguments
##
# Command line interface of the script.
#   -i/--input      directory holding collection-info.json and the problems
#   -o/--output     directory where the answers-*.json files are written
#   -n/--ngramsize  upper bound of the POS n-gram range (1, 2 or 3; default 2)
#   -f/--idf        weight the vectors with inverse document frequency
parser = argparse.ArgumentParser(description='PAN2018 author identificator based on POS vectors')
# Both directories are mandatory: without them the script used to run
# until the first path concatenation and die there with a TypeError on
# None.  Let argparse report the missing option up front instead.
parser.add_argument('-i', '--input', type=str, required=True, help='input directory')
parser.add_argument('-o', '--output', type=str, required=True, help='output directory')
parser.add_argument('-n', '--ngramsize', type=int, help='maximum n-gram size', choices=[1, 2, 3], default=2)
parser.add_argument('-f', '--idf', action='store_true', help='apply inverse document frequency', default=False)
args = parser.parse_args()

INPUT_DIR, OUTPUT_DIR = args.input, args.output
## ----------------------------------------------------------------------------
##
## Load of analyzers
##
print('Loading complexity analyzers for different languages...\n', flush=True)

# Language code -> analyzer class.  The analyzers are instantiated once,
# in this order, and looked up later by the 'language' field of a problem.
_ANALYZER_CLASSES = (
    ('en', ComplexityEnglish),
    ('sp', ComplexitySpanish),
    ('fr', ComplexityFrench),
    ('pl', ComplexityPolish),
    ('it', ComplexityItalian),
)
mlComplexityText = {}
for _code, _analyzer_class in _ANALYZER_CLASSES:
    mlComplexityText[_code] = _analyzer_class()
## ----------------------------------------------------------------------------
##
## Corpus loading (both, train and test data sets)
##
def _pos_document(analyzer, path):
    """Read the text file at ``path`` and return its POS tags as one string.

    ``analyzer.getPOS`` yields one list of tags per sentence; all tags are
    joined with single spaces so the result can be fed to a text vectorizer.
    """
    with open(path, 'r') as handle:
        tagged_sentences = analyzer.getPOS(handle.read())
    return " ".join(" ".join(sentence) for sentence in tagged_sentences)


def _text_row(position, pos_document, problem, candidate, label, filename):
    """Build the one-row DataFrame describing a single text.

    ``position`` becomes the row's index label (the position of the file
    inside its folder listing), as in the original implementation.
    """
    return pd.DataFrame(
        {
            'Pos': pos_document,
            'problem': problem['problem-name'],
            'language': problem['language'],
            'candidate': candidate,
            'label': label,
            'filename': filename,
        },
        index=[position],
    )


# labels maps "<problem-name><author-name>" to a consecutive integer;
# labels_cand is the inverse lookup (integer -> "<problem-name><author-name>").
labels = {}
labels_cand = []
# One-row frames are collected here and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the whole frame
# on every call.
_rows = []

#
# Walk through all the problems
#
print('Loading collection-info.json file from', INPUT_DIR, flush=True)
with open(os.path.join(INPUT_DIR, 'collection-info.json'), 'r') as f:
    collectionInfo = json.load(f)

for problem in collectionInfo:
    print('\n\nProblem: ', problem['problem-name'], flush=True)
    print('Language: ', problem['language'], flush=True)
    #
    # Pick the complexity analyzer of the problem's language
    #
    complexityText = mlComplexityText[problem['language']]
    problem_dir = os.path.join(INPUT_DIR, problem['problem-name'])
    #
    # Walk through all the candidates
    #
    print("Loading problem data...\n", flush=True)
    with open(os.path.join(problem_dir, 'problem-info.json'), 'r') as problem_info_fhd:
        problem_info = json.load(problem_info_fhd)
    #
    # Read the texts of known authorship (TRAINING TEXTS)
    #
    print("Loading training data")
    for candidate in problem_info['candidate-authors']:
        print('Candidate: ', candidate['author-name'], flush=True)
        candidate_dir = os.path.join(problem_dir, candidate['author-name'])
        files = os.listdir(candidate_dir)
        probcand = problem['problem-name'] + candidate['author-name']
        if probcand not in labels:
            labels[probcand] = len(labels)
            labels_cand += [probcand]
        #
        # Process every text of this candidate
        #
        for i, nameFile in enumerate(files):
            print('Reading text file: ', nameFile, flush=True)
            postags = _pos_document(complexityText, os.path.join(candidate_dir, nameFile))
            _rows.append(_text_row(i, postags, problem,
                                   candidate['author-name'], labels[probcand], nameFile))
    #
    # If a ground truth exists, read it to learn the true authors
    #
    unknown_candidates = False
    ground_truth_path = os.path.join(problem_dir, 'ground-truth.json')
    if os.path.isfile(ground_truth_path):
        print("Reading ground truth...", flush=True)
        with open(ground_truth_path, 'r') as fhnd:
            ground_truth = json.load(fhnd)
        unknown_candidates = {
            item['unknown-text']: item['true-author']
            for item in ground_truth['ground_truth']
        }
    #
    # Walk through the unlabelled files (TEST TEXTS)
    #
    print("Loading test data", flush=True)
    unknown_dir = os.path.join(problem_dir, problem_info['unknown-folder'])
    for i, unknown_file in enumerate(os.listdir(unknown_dir)):
        print("Analyzing file", unknown_file, flush=True)
        postags = _pos_document(complexityText, os.path.join(unknown_dir, unknown_file))
        # .get(): a file that is missing from the ground truth is treated
        # as unlabelled instead of aborting the run with a KeyError.
        true_author = unknown_candidates.get(unknown_file) if unknown_candidates else None
        if true_author:
            label = labels[problem['problem-name'] + true_author]
        else:
            true_author, label = None, None
        _rows.append(_text_row(i, postags, problem, true_author, label, unknown_file))

# Row index labels are kept as they were (no ignore_index).
postf = pd.concat(_rows) if _rows else pd.DataFrame()
## ----------------------------------------------------------------------------
##
## Training and classification
##
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(OUTPUT_DIR, exist_ok=True)

from sklearn import svm
clf = svm.LinearSVC(C=1)

#
# Build the vector space model.  It is fitted on every text of every
# problem, exactly as before, but only once: the result does not depend
# on the problem being classified, so refitting per problem was wasted work.
#
tfidfVectorizer = TfidfVectorizer(ngram_range=(1, args.ngramsize), use_idf=args.idf, norm='l2')
_pos_features = tfidfVectorizer.fit_transform(postf['Pos'])

_filenames = postf['filename'].values
_problem_names = postf['problem'].values
_label_values = postf['label'].values
# Texts of known authorship are the training set, the unknown ones the test set.
_is_known = postf['filename'].str.contains(r"\bknown", regex=True).values
_is_unknown = postf['filename'].str.contains(r"\bunknown", regex=True).values

for problem in set(postf['problem']):
    answers = []
    print('------- Training and classifying ', problem, flush=True)

    _in_problem = _problem_names == problem
    _train_rows = np.flatnonzero(_in_problem & _is_known)
    _test_rows = np.flatnonzero(_in_problem & _is_unknown)

    train_data = _pos_features[_train_rows].toarray()
    # The label column may be of object dtype when unlabelled test rows
    # carry None; training labels are always integers, so cast them to
    # give the classifier a plain integer target.
    train_target = _label_values[_train_rows].astype(int)
    test_data = _pos_features[_test_rows].toarray()
    # The test labels are deliberately not read: they were never used and
    # the column is empty when no ground truth is available (the former
    # test['label'] lookup raised KeyError in that case).

    # Train on the texts of known candidates and predict the unknown ones
    y_pred = clf.fit(train_data, train_target).predict(test_data)

    # Pair every test file with its prediction by position rather than by
    # DataFrame index label, which only matched when no file of the
    # unknown folder had been filtered out.
    for filename, predicted_label in zip(_filenames[_test_rows], y_pred):
        probcand = labels_cand[predicted_label]
        answers.append({
            'unknown-text': filename,
            'predicted-author': probcand[probcand.find("candidate"):],
        })

    with open(os.path.join(OUTPUT_DIR, 'answers-' + problem + '.json'), 'w') as answers_file:
        json.dump(answers, answers_file, indent=4)
print("done!")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment