Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
SINAI
/
clef-pan2018
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Settings
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
8f96e893
authored
May 12, 2018
by
Arturo Montejo Ráez
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
classify_comp working on tira.io
parent
1df9d9e2
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
295 additions
and
136 deletions
.gitignore
ComplexityEnglish.py
ComplexityFrench.py
ComplexityItalian.py
ComplexityLanguage.py
ComplexitySpanish.py
classify_comp
classify_comp.py
.gitignore
0 → 100644
View file @
8f96e893
__pycache__
ComplexityEnglish.py
View file @
8f96e893
import
sys
sys
.
path
.
append
(
'/home/garciacumbreras18/dist/freeling/APIs/python'
)
from
ComplexityLanguage
import
ComplexityLanguage
import
re
import
math
...
...
@@ -179,4 +181,4 @@ class ComplexityEnglish(ComplexityLanguage):
return
metrics
\ No newline at end of file
ComplexityFrench.py
View file @
8f96e893
import
sys
sys
.
path
.
append
(
'/home/garciacumbreras18/dist/freeling/APIs/python'
)
from
ComplexityLanguage
import
ComplexityLanguage
import
freeling
import
os
...
...
@@ -15,9 +17,9 @@ class ComplexityFrench(ComplexityLanguage):
ComplexityLanguage
.
__init__
(
self
,
lang
)
## Modify this line to be your FreeLing installation directory
FREELINGDIR
=
"/
usr/local
"
DATA
=
FREELINGDIR
+
"/
share/freeling
/"
CLASSDIR
=
"
/home/sinai/Experiments/CLEF-PAN/
"
FREELINGDIR
=
"/
home/garciacumbreras18/dist/freeling
"
DATA
=
FREELINGDIR
+
"/
data
/"
CLASSDIR
=
""
self
.
lang
=
lang
freeling
.
util_init_locale
(
"default"
)
...
...
@@ -50,12 +52,8 @@ class ComplexityFrench(ComplexityLanguage):
self
.
tg
=
freeling
.
hmm_tagger
(
DATA
+
lang
+
"/tagger.dat"
,
True
,
2
)
self
.
sen
=
freeling
.
senses
(
DATA
+
lang
+
"/senses.dat"
)
#Listas de palabras de Dale-Chall
CLASSDIR
=
"/home/sinai/Experiments/CLEF-PAN/"
f
=
open
(
CLASSDIR
+
'DaleChall.txt'
)
f
=
open
(
CLASSDIR
+
'
/home/garciacumbreras18/
DaleChall.txt'
)
lines
=
f
.
readlines
()
f
.
close
()
...
...
@@ -143,4 +141,4 @@ class ComplexityFrench(ComplexityLanguage):
return
metrics
\ No newline at end of file
ComplexityItalian.py
View file @
8f96e893
# -*- coding: utf-8 -*-
import
sys
sys
.
path
.
append
(
'/home/garciacumbreras18/dist/freeling/APIs/python'
)
import
freeling
import
os
import
re
...
...
@@ -12,9 +14,9 @@ class ComplexityItalian():
def
__init__
(
self
,
lang
=
'it'
):
## Modify this line to be your FreeLing installation directory
FREELINGDIR
=
"/
usr/local
"
DATA
=
FREELINGDIR
+
"/
share/freeling
/"
CLASSDIR
=
"/home/sinai/Experiments/CLEF-PAN/"
FREELINGDIR
=
"/
home/garciacumbreras18/dist/freeling
"
DATA
=
FREELINGDIR
+
"/
data
/"
self
.
DATA
=
DATA
self
.
lang
=
lang
freeling
.
util_init_locale
(
"default"
)
...
...
@@ -279,4 +281,4 @@ class ComplexityItalian():
\ No newline at end of file
ComplexityLanguage.py
View file @
8f96e893
# -*- coding: utf-8 -*-
import
sys
sys
.
path
.
append
(
'/home/garciacumbreras18/dist/freeling/APIs/python'
)
import
freeling
import
os
import
re
...
...
@@ -8,14 +12,14 @@ import scipy.stats
import
math
class
ComplexityLanguage
():
def
__init__
(
self
,
lang
):
## Modify this line to be your FreeLing installation directory
FREELINGDIR
=
"/
usr/local
"
DATA
=
FREELINGDIR
+
"/
share/freeling
/"
FREELINGDIR
=
"/
home/garciacumbreras18/dist/freeling
"
DATA
=
FREELINGDIR
+
"/
data
/"
self
.
DATA
=
DATA
self
.
lang
=
lang
freeling
.
util_init_locale
(
"default"
)
...
...
@@ -24,49 +28,49 @@ class ComplexityLanguage():
# create options set for maco analyzer. Default values are Ok, except for data files.
op
=
freeling
.
maco_options
(
lang
)
op
.
set_data_files
(
""
,
op
.
set_data_files
(
""
,
self
.
DATA
+
"common/punct.dat"
,
self
.
DATA
+
self
.
lang
+
"/dicc.src"
,
self
.
DATA
+
self
.
lang
+
"/afixos.dat"
,
""
,
self
.
DATA
+
self
.
lang
+
"/locucions.dat"
,
self
.
DATA
+
self
.
lang
+
"/locucions.dat"
,
self
.
DATA
+
self
.
lang
+
"/np.dat"
,
self
.
DATA
+
self
.
lang
+
"/quantities.dat"
,
self
.
DATA
+
self
.
lang
+
"/probabilitats.dat"
)
# create analyzers
self
.
tk
=
freeling
.
tokenizer
(
self
.
DATA
+
self
.
lang
+
"/tokenizer.dat"
)
#self.sp=freeling.splitter("/home/sinai/Freeling/data/"+self.lang+"/splitter.dat")
self
.
sp
=
freeling
.
splitter
(
self
.
DATA
+
self
.
lang
+
"/splitter.dat"
)
#self.sp=freeling.splitter("/home/sinai/Freeling/data/"+self.lang+"/splitter.dat")
self
.
sp
=
freeling
.
splitter
(
self
.
DATA
+
self
.
lang
+
"/splitter.dat"
)
self
.
mf
=
freeling
.
maco
(
op
)
# activate morphological analyzer (maco) modules to be used in next call
self
.
mf
.
set_active_options
(
False
,
True
,
True
,
True
,
# select which among created
True
,
True
,
False
,
True
,
# submodules are to be used.
True
,
True
,
True
,
True
)
# default: all created submodules are used
self
.
mf
.
set_active_options
(
False
,
True
,
True
,
True
,
# select which among created
True
,
True
,
False
,
True
,
# submodules are to be used.
True
,
True
,
True
,
True
)
# default: all created submodules are used
# create tagger, sense annotator, and parsers
self
.
tg
=
freeling
.
hmm_tagger
(
self
.
DATA
+
self
.
lang
+
"/tagger.dat"
,
True
,
2
)
self
.
sen
=
freeling
.
senses
(
self
.
DATA
+
self
.
lang
+
"/senses.dat"
)
#self.parser= freeling.chart_parser(DATA+lang+"/chunker/grammar-chunk.dat")
#self.dep=freeling.dep_txala(DATA+lang+"/dep_txala/dependences.dat", self.parser.get_start_symbol())
"""
"""
config es una lista de valores booleanos que activa o desactivan el cálculo de una medida
config = [
True|False, # PUNCTUATION MARKS
True|False, # SCI
True|False, # ARI
True|False, # ARI
True|False, # MU
]
Si config == None se calculan todas las métricas de complejidad soportadas
"""
self
.
config
=
[
True
,
True
,
True
,
True
]
self
.
metricsStr
=
[
'AVERAGE PUNCTUATION MARKS'
,
'SCI'
,
'ARI'
,
'MU'
]
pass
def
textProcessing
(
self
,
text
):
text
=
text
.
replace
(
u'
\xa0
'
,
u' '
)
.
replace
(
'"'
,
''
)
# meter todas las funciones en una patron de los tokens válidos
...
...
@@ -86,9 +90,9 @@ class ComplexityLanguage():
#ls = self.dep.analyze(ls)
#print("After dependencies", len(ls))
self
.
sentences
=
ls
self
.
N_sentences
=
len
(
ls
)
self
.
N_sentences
=
len
(
ls
)
self
.
sp
.
close_session
(
sid
)
#print('Las oraciones: ', self.sentences)
'''
Filtra aquellos tokens que no sean adjetivos, verbos o sustantivos
...
...
@@ -97,11 +101,11 @@ class ComplexityLanguage():
for
sentence
in
self
.
sentences
:
ws
=
sentence
.
get_words
();
pos_content_sentences
.
append
([
w
for
w
in
ws
if
re
.
match
(
'N.*|V.*|A.*'
,
w
.
get_tag
())])
self
.
pos_content_sentences
=
pos_content_sentences
return
self
.
pos_content_sentences
,
self
.
sentences
,
self
.
N_sentences
self
.
pos_content_sentences
=
pos_content_sentences
return
self
.
pos_content_sentences
,
self
.
sentences
,
self
.
N_sentences
def
punctuationMarks
(
self
):
#Solo nos interesa contar los tokens que sean signo de puntuación.
#Number of words.
...
...
@@ -114,10 +118,10 @@ class ComplexityLanguage():
else
:
lwords
.
append
(
w
.
get_form
())
self
.
N_words
=
len
(
lwords
)
#print('Number of words (N_w): ', self.N_words, '\n' )
self
.
N_punctuation
=
len
(
punctuation
)
self
.
punctuation
=
punctuation
...
...
@@ -125,14 +129,14 @@ class ComplexityLanguage():
punctuation_over_words
=
0
else
:
punctuation_over_words
=
self
.
N_punctuation
/
self
.
N_words
self
.
punctuation_over_words
=
punctuation_over_words
#print("PUNCTUATION MARKS = ", self.N_punctuation,'\n')
return
self
.
punctuation_over_words
,
self
.
N_punctuation
,
self
.
punctuation
,
self
.
N_words
def
sentenceComplexity
(
self
):
#Number of complex sentences
N_cs
=
0
for
sentence
in
self
.
sentences
:
...
...
@@ -149,25 +153,25 @@ class ComplexityLanguage():
else
:
previous_is_verb
=
False
if
count
>
0
:
N_cs
+=
1
N_cs
+=
1
self
.
N_cs
=
N_cs
#print("Number of complex sentences: ", self.N_cs, "\n")
ASL
=
self
.
N_words
/
self
.
N_sentences
self
.
ASL
=
ASL
#print("Average Sentence Length (ASL) = ", self.ASL, '\n')
CS
=
self
.
N_cs
/
self
.
N_sentences
self
.
CS
=
CS
#print("Complex Sentences (CS) = ", self.CS, '\n')
SCI
=
(
ASL
+
CS
)
/
2
self
.
SCI
=
SCI
#print("SENTENCE COMPLEXITY INDEX:(SCI) = ", self.SCI, "\n")
return
self
.
SCI
,
self
.
CS
,
self
.
N_cs
,
self
.
ASL
return
self
.
SCI
,
self
.
CS
,
self
.
N_cs
,
self
.
ASL
def
autoReadability
(
self
):
#Number of characters
count
=
0
...
...
@@ -178,24 +182,24 @@ class ComplexityLanguage():
count
+=
1
else
:
listwords
.
append
(
w
.
get_form
())
self
.
listwords
=
listwords
self
.
listwords
=
listwords
N_charac
=
0
for
characters
in
self
.
listwords
:
N_charac
+=
len
(
characters
)
self
.
N_charac
=
N_charac
#print("Number of characters: ", self.N_charac, "\n")
ARI
=
4.71
*
self
.
N_charac
/
self
.
N_words
+
0.5
*
self
.
N_words
/
self
.
N_sentences
-
21.43
self
.
ARI
=
ARI
#print("AUTOMATED READABILITY INDEX (ARI) = ", self.ARI, '\n')
return
self
.
ARI
,
self
.
N_charac
,
self
.
listwords
return
self
.
ARI
,
self
.
N_charac
,
self
.
listwords
def
mureadability
(
self
):
#Number of syllables and Number of words with 3 or more syllables:tagger
N_syllables
=
0
N_syllables3
=
0
...
...
@@ -207,10 +211,10 @@ class ComplexityLanguage():
count
+=
1
if
count
>=
3
:
N_syllables3
+=
1
self
.
N_syllables
=
N_syllables
self
.
N_syllables3
=
N_syllables3
#Number of letters
N_letters
=
0
letters
=
[]
...
...
@@ -220,33 +224,33 @@ class ComplexityLanguage():
letters
.
append
(
word
)
N_letters
+=
len
(
word
)
vecletters
.
append
(
len
(
word
))
self
.
letters
=
letters
self
.
N_letters
=
N_letters
self
.
vecletters
=
vecletters
x
=
self
.
N_letters
/
self
.
N_words
varianza
=
np
.
var
(
self
.
vecletters
)
mu
=
(
self
.
N_words
/
(
self
.
N_words
-
1
))
*
(
x
/
varianza
)
*
100
#print("READABILITY MU: ", mu, "\n")
self
.
mu
=
mu
return
self
.
mu
,
self
.
N_syllables
,
self
.
N_syllables3
,
self
.
letters
,
self
.
N_letters
,
self
.
vecletters
def
calcMetrics
(
self
,
text
):
"""
Calcula la métricas de complejidad activadas en la configuración
"""
"""
Calcula la métricas de complejidad activadas en la configuración
"""
self
.
textProcessing
(
text
)
metrics
=
{}
punctuationMarks
=
None
autoreadability
=
None
sentencecomplexity
=
None
for
i
in
range
(
0
,
len
(
self
.
metricsStr
)):
if
self
.
config
==
None
or
self
.
config
[
i
]
and
self
.
metricsStr
[
i
]
==
'AVERAGE PUNCTUATION MARKS'
:
punctuationmarks
=
self
.
punctuationMarks
()
metrics
[
'AVERAGE PUNCTUATION MARKS'
]
=
punctuationmarks
[
0
]
...
...
@@ -259,9 +263,9 @@ class ComplexityLanguage():
if
self
.
config
==
None
or
self
.
config
[
i
]
and
self
.
metricsStr
[
i
]
==
'MU'
:
mureadability
=
self
.
mureadability
()
metrics
[
'MU'
]
=
mureadability
[
0
]
return
metrics
return
metrics
def
getPOS
(
self
,
text
):
self
.
textProcessing
(
text
)
pos_sentences
=
[]
...
...
@@ -270,6 +274,5 @@ class ComplexityLanguage():
pos_sentences
.
append
([
w
.
get_tag
()
for
w
in
ws
])
#print('POS',pos_sentences)
self
.
pos_sentences
=
pos_sentences
return
self
.
pos_sentences
\ No newline at end of file
ComplexitySpanish.py
View file @
8f96e893
...
...
@@ -17,7 +17,7 @@ class ComplexitySpanish(ComplexityLanguage):
self
.
dep
=
freeling
.
dep_txala
(
self
.
DATA
+
self
.
lang
+
"/dep_txala/dependences.dat"
,
self
.
parser
.
get_start_symbol
())
# Para leer el texto que introducimos
CLASSDIR
=
"/home/
sinai/Experiments/CLEF-PAN
/"
CLASSDIR
=
"/home/
garciacumbreras18
/"
f
=
open
(
CLASSDIR
+
'CREA_total.txt'
)
lines
=
f
.
readlines
()
...
...
@@ -300,4 +300,4 @@ class ComplexitySpanish(ComplexityLanguage):
metrics
[
'CRAWFORD'
]
=
self
.
yearsCrawford
()
return
metrics
\ No newline at end of file
classify_comp
deleted
100755 → 0
View file @
1df9d9e2
#!/usr/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
# Universidad de Jaén
# 2018
import
json
import
os
from
ComplexityLanguage
import
ComplexityLanguage
from
ComplexitySpanish
import
ComplexitySpanish
from
ComplexityEnglish
import
ComplexityEnglish
from
ComplexityFrench
import
ComplexityFrench
from
ComplexityPolish
import
ComplexityPolish
from
ComplexityItalian
import
ComplexityItalian
import
pandas
as
pd
import
numpy
as
np
import
matplotlib.pyplot
as
plt
from
mpl_toolkits.mplot3d
import
Axes3D
from
sklearn.decomposition
import
PCA
from
sklearn
import
preprocessing
import
argparse
# -----------------------------------------------------------------------------
# Command-line driver: parse the input/output directories, build one
# complexity analyzer per supported language and load the collection index.
# -----------------------------------------------------------------------------

# Both directories are mandatory because every path below is derived from them.
# `type=str` replaces the undefined name `string`, which raised NameError as
# soon as the script was started.
parser = argparse.ArgumentParser(description='PAN2018 author identificator based on text complexity metrics')
parser.add_argument('-i', type=str, required=True, help='input directory')
parser.add_argument('-o', type=str, required=True, help='output directory')
args = parser.parse_args()
print(args.i, args.o)
# NOTE: a debugging `exit()` used to follow the print above and made the rest
# of the script unreachable; it has been removed so the collection is loaded.

print('Loading complexity analyzers for different languages...\n', flush=True)
# One analyzer instance per language code.
mlComplexityText = {
    'en': ComplexityEnglish(),
    'sp': ComplexitySpanish(),
    'fr': ComplexityFrench(),
    'pl': ComplexityPolish(),
    'it': ComplexityItalian(),
}

INPUT_DIR = args.i
OUTPUT_DIR = args.o

# The `with` statement closes the file; no explicit close() is needed.
# JSON is read as UTF-8 instead of relying on the platform locale.
with open(INPUT_DIR + '/collection-info.json', 'r', encoding='utf-8') as f:
    collectionInfo = json.load(f)
print(type(collectionInfo))
\ No newline at end of file
classify_comp.py
0 → 100755
View file @
8f96e893
#!/home/garciacumbreras18/anaconda3/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
###############################################################################
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
#
# Universidad de Jaén - 2018
###############################################################################
import
json
import
os
from
ComplexityLanguage
import
ComplexityLanguage
from
ComplexitySpanish
import
ComplexitySpanish
from
ComplexityEnglish
import
ComplexityEnglish
from
ComplexityFrench
import
ComplexityFrench
from
ComplexityPolish
import
ComplexityPolish
from
ComplexityItalian
import
ComplexityItalian
import
pandas
as
pd
import
numpy
as
np
import
matplotlib.pyplot
as
plt
from
mpl_toolkits.mplot3d
import
Axes3D
from
sklearn.decomposition
import
PCA
from
sklearn
import
preprocessing
import
argparse
## ----------------------------------------------------------------------------
##
## Read command line arguments
##
# Both directories are mandatory: every path below is built from them, and a
# missing value would otherwise surface later as an obscure TypeError.
parser = argparse.ArgumentParser(description='PAN2018 author identificator based on text complexity metrics')
parser.add_argument('-i', type=str, required=True, help='input directory')
parser.add_argument('-o', type=str, required=True, help='output directory')
args = parser.parse_args()
INPUT_DIR, OUTPUT_DIR = args.i, args.o

# Bookkeeping columns attached to every row of metrics; they are not features.
META_COLUMNS = ['problem', 'language', 'candidate', 'filename', 'label']

## ----------------------------------------------------------------------------
##
## Load of analyzers
##
print('Loading complexity analyzers for different languages...\n', flush=True)
# One analyzer instance per language code found in collection-info.json.
mlComplexityText = {
    'en': ComplexityEnglish(),
    'sp': ComplexitySpanish(),
    'fr': ComplexityFrench(),
    'pl': ComplexityPolish(),
    'it': ComplexityItalian(),
}

## ----------------------------------------------------------------------------
##
## Corpus loading (both, train and test data sets)
##
# Rows are collected in plain lists and concatenated once at the end:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas >= 2.0.
known_rows = []
unknown_rows = []
labels = {}        # "<problem-name><author-name>" -> integer class label
labels_cand = []   # inverse mapping: integer class label -> "<problem><author>"

#
# Walk through every problem of the collection
#
print('Loading collection-info.json file from', args.i, flush=True)
# Files are opened explicitly as UTF-8 so that reading does not depend on the
# locale of the machine running the script.
# NOTE(review): assumes the corpus text files are UTF-8 encoded - confirm.
with open(os.path.join(INPUT_DIR, 'collection-info.json'), 'r', encoding='utf-8') as f:
    collectionInfo = json.load(f)

for problem in collectionInfo:
    print('\n\nProblem: ', problem['problem-name'], flush=True)
    print('Language: ', problem['language'], flush=True)
    problem_dir = os.path.join(INPUT_DIR, problem['problem-name'])

    #
    # Pick the complexity analyzer that matches the language of the problem
    #
    complexityText = mlComplexityText[problem['language']]

    #
    # Read the problem description (candidate authors, unknown folder)
    #
    print("Loading problem data...\n", flush=True)
    with open(os.path.join(problem_dir, 'problem-info.json'), 'r', encoding='utf-8') as problem_info_fhd:
        problem_info = json.load(problem_info_fhd)

    #
    # Read the texts of known authorship (TRAINING TEXTS)
    #
    print("Loading training data")
    for candidate in problem_info['candidate-authors']:
        print('Candidate: ', candidate['author-name'], flush=True)
        candidate_dir = os.path.join(problem_dir, candidate['author-name'])
        files = os.listdir(candidate_dir)
        probcand = problem['problem-name'] + candidate['author-name']
        if probcand not in labels:
            labels[probcand] = len(labels)
            labels_cand.append(probcand)

        #
        # Compute the metrics of every text written by this candidate
        #
        for i, nameFile in enumerate(files):
            print('Reading text file: ', nameFile, flush=True)
            with open(os.path.join(candidate_dir, nameFile), 'r', encoding='utf-8') as context:
                calcmetrics = complexityText.calcMetrics(context.read())
            dfi = pd.DataFrame(calcmetrics, index=[i])
            dfi['problem'] = problem['problem-name']
            dfi['language'] = problem['language']
            dfi['candidate'] = candidate['author-name']
            dfi['label'] = labels[probcand]
            dfi['filename'] = nameFile
            known_rows.append(dfi)

    #
    # If a ground truth file exists, read it to learn the true authors
    #
    unknown_candidates = {}
    ground_truth_path = os.path.join(problem_dir, 'ground-truth.json')
    if os.path.isfile(ground_truth_path):
        print("Reading ground truth...", flush=True)
        with open(ground_truth_path, 'r', encoding='utf-8') as ground_truth_fhd:
            ground_truth = json.load(ground_truth_fhd)
        for item in ground_truth['ground_truth']:
            unknown_candidates[item['unknown-text']] = item['true-author']

    #
    # Walk through the unlabelled files (TEST TEXTS)
    #
    print("Loading test data", flush=True)
    unknown_dir = os.path.join(problem_dir, problem_info['unknown-folder'])
    for i, unknown_file in enumerate(os.listdir(unknown_dir)):
        print("Analyzing file", unknown_file, flush=True)
        with open(os.path.join(unknown_dir, unknown_file), 'r', encoding='utf-8') as unknown_fhd:
            calcmetrics = complexityText.calcMetrics(unknown_fhd.read())
        dfi = pd.DataFrame(calcmetrics, index=[i])
        dfi['problem'] = problem['problem-name']
        dfi['language'] = problem['language']
        # .get() keeps the run alive when a file is missing from the ground
        # truth or its author is not one of the candidates seen in training.
        true_author = unknown_candidates.get(unknown_file)
        if true_author:
            dfi['candidate'] = true_author
            dfi['label'] = labels.get(problem['problem-name'] + true_author)
        else:
            dfi['candidate'] = None
            dfi['label'] = None
        dfi['filename'] = unknown_file
        unknown_rows.append(dfi)

# An empty corpus still yields frames that carry the bookkeeping columns, so
# the code below can index them without special cases.
complexity_known = pd.concat(known_rows) if known_rows else pd.DataFrame(columns=META_COLUMNS)
complexity_unknown = pd.concat(unknown_rows) if unknown_rows else pd.DataFrame(columns=META_COLUMNS)

## ----------------------------------------------------------------------------
##
## Training and classification
##
os.makedirs(OUTPUT_DIR, exist_ok=True)

from sklearn import svm
clf = svm.LinearSVC(C=1)

for problem in set(complexity_known['problem']):
    answers = []
    print('------- Training and classifying ', problem, flush=True)

    #
    # Training uses the texts of known authorship, testing the unknown ones
    #
    train = complexity_known.loc[complexity_known['problem'] == problem]
    test = complexity_unknown.loc[complexity_unknown['problem'] == problem]
    train_target = train['label']

    # Metrics that do not exist for this problem's language are NaN for all
    # of its rows; drop them. The bookkeeping columns are removed first so
    # that an empty 'label'/'candidate' (no ground truth available) cannot
    # break the test set, and only the metrics present on both sides are kept
    # so the classifier sees the same features when fitting and predicting.
    train_features = train.drop(META_COLUMNS, axis=1).dropna(axis=1, how='any')
    test_features = test.drop(META_COLUMNS, axis=1).dropna(axis=1, how='any')
    feature_columns = [col for col in train_features.columns if col in test_features.columns]

    if not test.empty:
        train_data = pd.DataFrame(preprocessing.normalize(train_features[feature_columns], norm='l2'))
        test_data = pd.DataFrame(preprocessing.normalize(test_features[feature_columns], norm='l2'))

        # Fit on the known texts and predict the author of the unknown ones
        y_pred = clf.fit(train_data, train_target).predict(test_data)

        # Predictions come back in row order, so pair them positionally with
        # the file names instead of using DataFrame index labels as offsets.
        for unknown_text, predicted_label in zip(test['filename'], y_pred):
            probcand = labels_cand[predicted_label]
            answers.append({
                'unknown-text': unknown_text,
                # probcand is "<problem-name><author-name>": strip the prefix.
                'predicted-author': probcand[len(problem):],
            })

    with open(os.path.join(OUTPUT_DIR, 'answers-' + problem + '.json'), 'w', encoding='utf-8') as answers_fhd:
        json.dump(answers, answers_fhd, indent=4)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment