Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
SINAI
/
clef-pan2018
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Settings
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
94131f3d
authored
May 17, 2018
by
Arturo Montejo Ráez
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
new version
parent
8f96e893
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
237 additions
and
1 deletions
ComplexityPolish.py
classify_comp.py
classify_postf.py
ComplexityPolish.py
View file @
94131f3d
import
sys
sys
.
path
.
append
(
'/home/garciacumbreras18/dist/treetagger'
)
import
nltk
import
re
from
treetagger
import
TreeTagger
class
ComplexityPolish
():
...
...
@@ -177,3 +181,20 @@ class ComplexityPolish():
return
metrics
def
getPOS
(
self
,
text
):
tt
=
TreeTagger
(
language
=
'polish'
)
sentences
=
tt
.
tag
(
text
)
pos_sentences
=
[]
sent
=
[]
for
w
in
sentences
:
tag
=
w
[
1
]
.
split
(
':'
)[
0
]
if
tag
==
'SENT'
:
pos_sentences
.
append
(
sent
)
sent
=
[]
else
:
sent
+=
[
tag
]
self
.
pos_sentences
=
pos_sentences
return
self
.
pos_sentences
classify_comp.py
View file @
94131f3d
...
...
@@ -192,7 +192,7 @@ for problem in set(complexity_known['problem']):
test_data
=
test
.
drop
([
'problem'
,
'language'
,
'candidate'
,
'filename'
,
'label'
],
axis
=
1
)
test_data
=
pd
.
DataFrame
(
preprocessing
.
normalize
(
test_data
,
norm
=
'l2'
))
#Entrenamos con los textos con candidatos conocidos y predecimos con los datos desconocidos
#
Entrenamos con los textos con candidatos conocidos y predecimos con los datos desconocidos
y_pred
=
clf
.
fit
(
train_data
,
train_target
)
.
predict
(
test_data
)
for
index
,
row
in
test
.
iterrows
():
...
...
classify_postf.py
0 → 100755
View file @
94131f3d
#!/home/garciacumbreras18/anaconda3/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
###############################################################################
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
#
# Universidad de Jaén - 2018
###############################################################################
import
json
import
os
from
ComplexityLanguage
import
ComplexityLanguage
from
ComplexitySpanish
import
ComplexitySpanish
from
ComplexityEnglish
import
ComplexityEnglish
from
ComplexityFrench
import
ComplexityFrench
from
ComplexityPolish
import
ComplexityPolish
from
ComplexityItalian
import
ComplexityItalian
import
pandas
as
pd
import
numpy
as
np
from
sklearn
import
preprocessing
from
sklearn.feature_extraction.text
import
TfidfVectorizer
import
argparse
## ----------------------------------------------------------------------------
##
## Read command line arguments
##
parser
=
argparse
.
ArgumentParser
(
description
=
'PAN2018 author identificator based on POS vectors'
)
parser
.
add_argument
(
'-i'
,
'--input'
,
type
=
str
,
help
=
'input directory'
)
parser
.
add_argument
(
'-o'
,
'--output'
,
type
=
str
,
help
=
'output directory'
)
parser
.
add_argument
(
'-n'
,
'--ngramsize'
,
type
=
int
,
help
=
'maximum n-gram size'
,
choices
=
[
1
,
2
,
3
],
default
=
2
)
parser
.
add_argument
(
'-f'
,
'--idf'
,
action
=
'store_true'
,
help
=
'apply inverse document frequency'
,
default
=
False
)
args
=
parser
.
parse_args
()
INPUT_DIR
,
OUTPUT_DIR
=
args
.
input
,
args
.
output
## ----------------------------------------------------------------------------
##
## Load of analyzers
##
print
(
'Loading complexity analyzers for different languages...
\n
'
,
flush
=
True
)
mlComplexityText
=
{
'en'
:
ComplexityEnglish
(),
'sp'
:
ComplexitySpanish
(),
'fr'
:
ComplexityFrench
(),
'pl'
:
ComplexityPolish
(),
'it'
:
ComplexityItalian
()
}
## ----------------------------------------------------------------------------
##
## Corpus loading (both, train and test data sets)
##
postf
=
pd
.
DataFrame
()
labels
=
{}
labels_cand
=
[]
#
# Recorremos todos los problemas
#
print
(
'Loading collection-info.json file from'
,
INPUT_DIR
,
flush
=
True
)
with
open
(
INPUT_DIR
+
'/collection-info.json'
,
'r'
)
as
f
:
collectionInfo
=
json
.
load
(
f
)
f
.
close
()
for
problem
in
collectionInfo
:
print
(
'
\n\n
Problem: '
,
problem
[
'problem-name'
],
flush
=
True
)
print
(
'Language: '
,
problem
[
'language'
],
flush
=
True
)
#
# Cargamos la clase para el cálculo de la complejidad del idioma correspondiente
#
complexityText
=
mlComplexityText
[
problem
[
'language'
]]
#
# Recorremos todos los candidatos
#
print
(
"Loading problem data...
\n
"
,
flush
=
True
)
with
open
(
INPUT_DIR
+
'/'
+
problem
[
'problem-name'
]
+
'/problem-info.json'
,
'r'
)
as
problem_info_fhd
:
problem_info
=
json
.
load
(
problem_info_fhd
)
problem_info_fhd
.
close
()
#
# Leemos los textos de autoría conocida (TEXTOS DE ENTRENAMIENTO)
#
print
(
"Loading training data"
)
for
candidate
in
problem_info
[
'candidate-authors'
]:
print
(
'Candidate: '
,
candidate
[
'author-name'
],
flush
=
True
)
files
=
os
.
listdir
(
os
.
path
.
join
(
INPUT_DIR
,
problem
[
'problem-name'
],
candidate
[
'author-name'
]))
probcand
=
problem
[
'problem-name'
]
+
candidate
[
'author-name'
]
if
not
probcand
in
labels
:
labels
[
probcand
]
=
len
(
labels
)
labels_cand
+=
[
probcand
]
#
# Procesamos todo los textos de ese candidato
#
for
i
,
nameFile
in
enumerate
(
files
):
print
(
'Reading text file: '
,
nameFile
,
flush
=
True
)
with
open
(
os
.
path
.
join
(
os
.
path
.
join
(
INPUT_DIR
,
problem
[
'problem-name'
],
candidate
[
'author-name'
]),
nameFile
),
'r'
)
as
fhnd
:
postags
=
complexityText
.
getPOS
(
fhnd
.
read
())
fhnd
.
close
()
postags
=
" "
.
join
([
" "
.
join
(
p
)
for
p
in
postags
])
dfi
=
pd
.
DataFrame
({
'Pos'
:
postags
},
index
=
[
i
])
dfi
[
'problem'
]
=
problem
[
'problem-name'
]
dfi
[
'language'
]
=
problem
[
'language'
]
dfi
[
'candidate'
]
=
candidate
[
'author-name'
]
dfi
[
'label'
]
=
labels
[
probcand
]
dfi
[
'filename'
]
=
nameFile
postf
=
postf
.
append
([
dfi
])
#
# Si existe ground-truth, lo leemos para conocer los candidatos
#
unknown_candidates
=
False
if
os
.
path
.
isfile
(
INPUT_DIR
+
'/'
+
problem
[
'problem-name'
]
+
'/ground-truth.json'
):
print
(
"Reading ground truth..."
,
flush
=
True
)
with
open
(
INPUT_DIR
+
'/'
+
problem
[
'problem-name'
]
+
'/ground-truth.json'
,
'r'
)
as
fhnd
:
ground_truth
=
json
.
load
(
fhnd
)
fhnd
.
close
()
unknown_candidates
=
{}
for
item
in
ground_truth
[
'ground_truth'
]:
unknown_candidates
[
item
[
'unknown-text'
]]
=
item
[
'true-author'
]
#
# Recorremos archivos sin etiquetar (TEXTOS DE TEST)
#
print
(
"Loading test data"
,
flush
=
True
)
for
i
,
unknown_file
in
enumerate
(
os
.
listdir
(
os
.
path
.
join
(
INPUT_DIR
,
problem
[
'problem-name'
],
problem_info
[
'unknown-folder'
]))):
print
(
"Analyzing file"
,
unknown_file
,
flush
=
True
)
with
open
(
INPUT_DIR
+
'/'
+
problem
[
'problem-name'
]
+
'/'
+
problem_info
[
'unknown-folder'
]
+
'/'
+
unknown_file
,
'r'
)
as
fhnd
:
postags
=
complexityText
.
getPOS
(
fhnd
.
read
())
fhnd
.
close
()
postags
=
" "
.
join
([
" "
.
join
(
p
)
for
p
in
postags
])
dfi
=
pd
.
DataFrame
({
'Pos'
:
postags
},
index
=
[
i
])
dfi
[
'problem'
]
=
problem
[
'problem-name'
]
dfi
[
'language'
]
=
problem
[
'language'
]
if
unknown_candidates
and
unknown_candidates
[
unknown_file
]:
probcand
=
problem
[
'problem-name'
]
+
unknown_candidates
[
unknown_file
]
dfi
[
'candidate'
]
=
unknown_candidates
[
unknown_file
]
dfi
[
'label'
]
=
labels
[
probcand
]
else
:
dfi
[
'candidate'
]
=
None
dfi
[
'label'
]
=
None
dfi
[
'filename'
]
=
unknown_file
postf
=
postf
.
append
([
dfi
])
## ----------------------------------------------------------------------------
##
## Training and classification
##
if
not
os
.
path
.
exists
(
OUTPUT_DIR
):
os
.
makedirs
(
OUTPUT_DIR
)
from
sklearn
import
svm
clf
=
svm
.
LinearSVC
(
C
=
1
)
for
problem
in
set
(
postf
[
'problem'
]):
answers
=
[]
print
(
'------- Training and classifying '
,
problem
,
flush
=
True
)
#
# Calculamos el modelo de espacio vectorial
#
tfidfVectorizer
=
TfidfVectorizer
(
ngram_range
=
(
1
,
args
.
ngramsize
),
use_idf
=
args
.
idf
,
norm
=
'l2'
)
postf
[
'POStfidf'
]
=
list
(
tfidfVectorizer
.
fit_transform
(
postf
[
'Pos'
]))
#
# Para el train cogemos los textos conocidos
#
train
=
postf
[
postf
[
'filename'
]
.
str
.
contains
(
r"\bknown"
,
regex
=
True
)]
train
=
train
.
loc
[
train
[
'problem'
]
==
problem
]
train
=
train
.
dropna
(
axis
=
1
,
how
=
'any'
)
train_target
=
train
[
'label'
]
train_data
=
np
.
array
(
list
(
train
[
'POStfidf'
]
.
apply
(
lambda
x
:
x
.
toarray
()[
0
])))
#
# Para el test cogemos los textos desconocidos
#
test
=
postf
[
postf
[
'filename'
]
.
str
.
contains
(
r"\bunknown"
,
regex
=
True
)]
test
=
test
.
loc
[
test
[
'problem'
]
==
problem
]
test
=
test
.
dropna
(
axis
=
1
,
how
=
'any'
)
test_target
=
test
[
'label'
]
test_data
=
np
.
array
(
list
(
test
[
'POStfidf'
]
.
apply
(
lambda
x
:
x
.
toarray
()[
0
])))
# Entrenamos con los textos con candidatos conocidos y predecimos con los datos desconocidos
y_pred
=
clf
.
fit
(
train_data
,
train_target
)
.
predict
(
test_data
)
for
index
,
row
in
test
.
iterrows
():
probcand
=
labels_cand
[
y_pred
[
index
]]
answers
.
append
({
'unknown-text'
:
row
[
'filename'
],
'predicted-author'
:
probcand
[
probcand
.
find
(
"candidate"
):],
})
with
open
(
OUTPUT_DIR
+
'/answers-'
+
problem
+
'.json'
,
'w'
)
as
file
:
json
.
dump
(
answers
,
file
,
indent
=
4
)
print
(
"done!"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment