classify_comp working on tira.io

parent 1df9d9e2
+import sys
+sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
from ComplexityLanguage import ComplexityLanguage
import re
import math
@@ -179,4 +181,4 @@ class ComplexityEnglish(ComplexityLanguage):
        return metrics
\ No newline at end of file
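All five analyzers now import the FreeLing Python API from the distribution bundled with the submission. A minimal usage sketch under that assumption (the no-argument constructor matches how the classify_comp script below instantiates the analyzers):

import sys
sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
from ComplexityEnglish import ComplexityEnglish

# calcMetrics (inherited from ComplexityLanguage) returns a dict keyed by
# metric name, e.g. 'AVERAGE PUNCTUATION MARKS', 'SCI', 'ARI'.
analyzer = ComplexityEnglish()
print(analyzer.calcMetrics('A short sample text. It has two sentences.'))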
+import sys
+sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
from ComplexityLanguage import ComplexityLanguage
import freeling
import os
@@ -15,9 +17,9 @@ class ComplexityFrench(ComplexityLanguage):
        ComplexityLanguage.__init__(self, lang)
        ## Modify this line to be your FreeLing installation directory
-        FREELINGDIR = "/usr/local"
+        FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
-        DATA = FREELINGDIR+"/share/freeling/"
+        DATA = FREELINGDIR+"/data/"
-        CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
+        CLASSDIR = ""
        self.lang = lang
        freeling.util_init_locale("default")
@@ -50,12 +52,8 @@ class ComplexityFrench(ComplexityLanguage):
        self.tg=freeling.hmm_tagger(DATA+lang+"/tagger.dat",True,2)
        self.sen=freeling.senses(DATA+lang+"/senses.dat")
-        # Dale-Chall word lists
-        CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
-        f = open(CLASSDIR + 'DaleChall.txt')
+        f = open(CLASSDIR + '/home/garciacumbreras18/DaleChall.txt')
        lines = f.readlines()
        f.close()
@@ -143,4 +141,4 @@ class ComplexityFrench(ComplexityLanguage):
        return metrics
\ No newline at end of file
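With CLASSDIR now the empty string, the concatenation above resolves to the absolute path /home/garciacumbreras18/DaleChall.txt. An equivalent, more explicit read; the set is an assumption about how an easy-word list is typically used:

# Same file as above; a set makes membership tests against the
# Dale-Chall easy-word list cheap.
with open('/home/garciacumbreras18/DaleChall.txt') as f:
    dale_chall_words = set(line.strip() for line in f)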
# -*- coding: utf-8 -*-
+import sys
+sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
import freeling
import os
import re
@@ -12,9 +14,9 @@ class ComplexityItalian():
    def __init__(self, lang = 'it'):
        ## Modify this line to be your FreeLing installation directory
-        FREELINGDIR = "/usr/local"
+        FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
-        DATA = FREELINGDIR+"/share/freeling/"
+        DATA = FREELINGDIR+"/data/"
-        CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
        self.DATA = DATA
        self.lang = lang
        freeling.util_init_locale("default")
@@ -279,4 +281,4 @@ class ComplexityItalian():
\ No newline at end of file
# -*- coding: utf-8 -*-
+import sys
+sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
import freeling
import os
import re
@@ -8,14 +12,14 @@ import scipy.stats
import math
class ComplexityLanguage():
    def __init__(self, lang):
        ## Modify this line to be your FreeLing installation directory
-        FREELINGDIR = "/usr/local"
+        FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
-        DATA = FREELINGDIR+"/share/freeling/"
+        DATA = FREELINGDIR+"/data/"
        self.DATA = DATA
        self.lang = lang
        freeling.util_init_locale("default")
@@ -24,49 +28,49 @@ class ComplexityLanguage():
        # create options set for maco analyzer. Default values are Ok, except for data files.
        op= freeling.maco_options(lang)
        op.set_data_files( "",
                           self.DATA + "common/punct.dat",
                           self.DATA + self.lang + "/dicc.src",
                           self.DATA + self.lang + "/afixos.dat",
                           "",
                           self.DATA + self.lang + "/locucions.dat",
                           self.DATA + self.lang + "/np.dat",
                           self.DATA + self.lang + "/quantities.dat",
                           self.DATA + self.lang + "/probabilitats.dat")
        # create analyzers
        self.tk=freeling.tokenizer(self.DATA+self.lang+"/tokenizer.dat")
        #self.sp=freeling.splitter("/home/sinai/Freeling/data/"+self.lang+"/splitter.dat")
        self.sp=freeling.splitter(self.DATA+self.lang+"/splitter.dat")
        self.mf=freeling.maco(op)
        # activate morpho modules to be used in next call
        self.mf.set_active_options(False, True, True, True,  # select which among created
                                   True, True, False, True,  # submodules are to be used.
                                   True, True, True, True )  # default: all created submodules are used
        # create tagger, sense annotator, and parsers
        self.tg=freeling.hmm_tagger(self.DATA+self.lang+"/tagger.dat",True,2)
        self.sen=freeling.senses(self.DATA+self.lang+"/senses.dat")
        #self.parser= freeling.chart_parser(DATA+lang+"/chunker/grammar-chunk.dat")
        #self.dep=freeling.dep_txala(DATA+lang+"/dep_txala/dependences.dat", self.parser.get_start_symbol())
        """
        config is a list of boolean values that enable or disable the computation of each metric:
        config = [
            True|False,  # PUNCTUATION MARKS
            True|False,  # SCI
            True|False,  # ARI
            True|False,  # MU
        ]
        If config == None, all supported complexity metrics are computed.
        """
        self.config = [True, True, True, True]
        self.metricsStr = ['AVERAGE PUNCTUATION MARKS', 'SCI', 'ARI', 'MU']
        pass
    def textProcessing(self, text):
        text = text.replace(u'\xa0', u' ').replace('"', '')
        # build a single pattern of the valid tokens
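The config list documented above gates each metric inside calcMetrics, so callers can disable measures per run. A minimal sketch, assuming a working FreeLing installation at the paths set in this commit (language code and sample text are illustrative):

# Compute only SCI and ARI; the flag order follows metricsStr above.
analyzer = ComplexityLanguage('es')
analyzer.config = [False, True, True, False]   # PUNCTUATION, SCI, ARI, MU
print(analyzer.calcMetrics('Una frase corta. Y otra frase más larga.'))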
@@ -86,9 +90,9 @@ class ComplexityLanguage():
        #ls = self.dep.analyze(ls)
        #print("After dependencies", len(ls))
        self.sentences = ls
        self.N_sentences = len(ls)
        self.sp.close_session(sid)
        #print('Sentences: ', self.sentences)
        '''
        Filters out those tokens that are not adjectives, verbs or nouns
@@ -97,11 +101,11 @@ class ComplexityLanguage():
        for sentence in self.sentences:
            ws = sentence.get_words();
            pos_content_sentences.append([w for w in ws if re.match('N.*|V.*|A.*', w.get_tag())])
        self.pos_content_sentences = pos_content_sentences
        return self.pos_content_sentences, self.sentences, self.N_sentences
    def punctuationMarks(self):
        # We only want to count the tokens that are punctuation marks.
        #Number of words.
@@ -114,10 +118,10 @@ class ComplexityLanguage():
            else:
                lwords.append(w.get_form())
        self.N_words = len(lwords)
        #print('Number of words (N_w): ', self.N_words, '\n' )
        self.N_punctuation = len(punctuation)
        self.punctuation = punctuation
@@ -125,14 +129,14 @@ class ComplexityLanguage():
            punctuation_over_words = 0
        else:
            punctuation_over_words = self.N_punctuation / self.N_words
        self.punctuation_over_words = punctuation_over_words
        #print("PUNCTUATION MARKS = ", self.N_punctuation,'\n')
        return self.punctuation_over_words, self.N_punctuation, self.punctuation, self.N_words
    def sentenceComplexity(self):
        #Number of complex sentences
        N_cs = 0
        for sentence in self.sentences:
@@ -149,25 +153,25 @@ class ComplexityLanguage():
                else:
                    previous_is_verb = False
            if count>0:
                N_cs += 1
        self.N_cs = N_cs
        #print("Number of complex sentences: ", self.N_cs, "\n")
        ASL = self.N_words / self.N_sentences
        self.ASL = ASL
        #print("Average Sentence Length (ASL) = ", self.ASL, '\n')
        CS = self.N_cs / self.N_sentences
        self.CS = CS
        #print("Complex Sentences (CS) = ", self.CS, '\n')
        SCI = (ASL + CS)/ 2
        self.SCI = SCI
        #print("SENTENCE COMPLEXITY INDEX (SCI) = ", self.SCI, "\n")
        return self.SCI, self.CS, self.N_cs, self.ASL
    def autoReadability(self):
        #Number of characters
        count = 0
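sentenceComplexity averages the mean sentence length (ASL) with the proportion of complex sentences (CS) to get SCI. A worked check with invented counts:

# SCI with illustrative counts (not from any real text)
N_words, N_sentences, N_cs = 100, 5, 3
ASL = N_words / N_sentences   # 20.0 words per sentence
CS = N_cs / N_sentences       # 0.6 complex-sentence ratio
SCI = (ASL + CS) / 2          # 10.3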
@@ -178,24 +182,24 @@ class ComplexityLanguage():
                count +=1
            else:
                listwords.append(w.get_form())
        self.listwords = listwords
        N_charac = 0
        for characters in self.listwords:
            N_charac += len(characters)
        self.N_charac = N_charac
        #print("Number of characters: ", self.N_charac, "\n")
        ARI = 4.71 * self.N_charac / self.N_words + 0.5 * self.N_words / self.N_sentences - 21.43
        self.ARI = ARI
        #print("AUTOMATED READABILITY INDEX (ARI) = ", self.ARI, '\n')
        return self.ARI, self.N_charac, self.listwords
    def mureadability(self):
        #Number of syllables and number of words with 3 or more syllables: tagger
        N_syllables = 0
        N_syllables3 = 0
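autoReadability implements the standard Automated Readability Index, ARI = 4.71*(characters/words) + 0.5*(words/sentences) - 21.43. A quick check with invented counts:

# ARI for a hypothetical text: 500 characters, 100 words, 5 sentences
N_charac, N_words, N_sentences = 500, 100, 5
ARI = 4.71 * N_charac / N_words + 0.5 * N_words / N_sentences - 21.43
print(round(ARI, 2))  # 4.71*5 + 0.5*20 - 21.43 = 12.12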
@@ -207,10 +211,10 @@ class ComplexityLanguage():
                    count+=1
            if count>=3:
                N_syllables3 += 1
        self.N_syllables = N_syllables
        self.N_syllables3 = N_syllables3
        #Number of letters
        N_letters= 0
        letters = []
@@ -220,33 +224,33 @@ class ComplexityLanguage():
                letters.append(word)
                N_letters+=len(word)
                vecletters.append(len(word))
        self.letters = letters
        self.N_letters = N_letters
        self.vecletters = vecletters
        x=self.N_letters / self.N_words
        varianza=np.var(self.vecletters)
        mu = (self.N_words /(self.N_words-1))*(x/varianza)*100
        #print("READABILITY MU: ", mu, "\n")
        self.mu = mu
        return self.mu, self.N_syllables, self.N_syllables3, self.letters, self.N_letters, self.vecletters
    def calcMetrics(self, text):
        """
        Computes the complexity metrics enabled in the configuration
        """
        self.textProcessing(text)
        metrics = {}
        punctuationMarks = None
        autoreadability = None
        sentencecomplexity = None
        for i in range(0, len(self.metricsStr)):
            if self.config == None or self.config[i] and self.metricsStr[i] == 'AVERAGE PUNCTUATION MARKS':
                punctuationmarks = self.punctuationMarks()
                metrics['AVERAGE PUNCTUATION MARKS'] = punctuationmarks[0]
@@ -259,9 +263,9 @@ class ComplexityLanguage():
            if self.config == None or self.config[i] and self.metricsStr[i] == 'MU':
                mureadability = self.mureadability()
                metrics['MU'] = mureadability[0]
        return metrics
    def getPOS(self, text):
        self.textProcessing(text)
        pos_sentences = []
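mureadability derives the μ readability index from the word count, the mean word length and the variance of word lengths. A worked check with invented numbers:

# mu with illustrative values: 100 words, 450 letters, variance 4.5
N_words, N_letters, varianza = 100, 450, 4.5
x = N_letters / N_words   # mean word length: 4.5
mu = (N_words / (N_words - 1)) * (x / varianza) * 100
print(round(mu, 2))       # 101.01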
@@ -270,6 +274,5 @@ class ComplexityLanguage():
            pos_sentences.append([w.get_tag() for w in ws])
        #print('POS',pos_sentences)
        self.pos_sentences = pos_sentences
        return self.pos_sentences
\ No newline at end of file
@@ -17,7 +17,7 @@ class ComplexitySpanish(ComplexityLanguage):
        self.dep=freeling.dep_txala(self.DATA+self.lang+"/dep_txala/dependences.dat", self.parser.get_start_symbol())
        # To read the text we pass in
-        CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
+        CLASSDIR = "/home/garciacumbreras18/"
        f = open(CLASSDIR + 'CREA_total.txt')
        lines = f.readlines()
@@ -300,4 +300,4 @@ class ComplexitySpanish(ComplexityLanguage):
        metrics['CRAWFORD'] = self.yearsCrawford()
        return metrics
\ No newline at end of file
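ComplexitySpanish additionally loads the CREA frequency list (Corpus de Referencia del Español Actual) and, per the last hunk, reports a Crawford score. A hedged sketch of parsing such a frequency file; the column layout and encoding are assumptions, not taken from this commit:

# Assumed CREA layout per line: rank, word, absolute frequency, ...
crea_freq = {}
with open('/home/garciacumbreras18/CREA_total.txt', encoding='latin-1') as f:
    for line in f:
        parts = line.split()
        if len(parts) >= 3:
            crea_freq[parts[1]] = float(parts[2].replace(',', ''))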
#!/usr/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
# Universidad de Jaén
# 2018
import json
import os
from ComplexityLanguage import ComplexityLanguage
from ComplexitySpanish import ComplexitySpanish
from ComplexityEnglish import ComplexityEnglish
from ComplexityFrench import ComplexityFrench
from ComplexityPolish import ComplexityPolish
from ComplexityItalian import ComplexityItalian
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn import preprocessing
import argparse
parser = argparse.ArgumentParser(description='PAN2018 author identificator based on text complexity metrics')
parser.add_argument('-i', type=string, help='input directory')
parser.add_argument('-o', type=string, help='output directory')
args = parser.parse_args()
print(args.i, args.o)
exit()
print('Loading complexity analyzers for different languages...\n', flush=True)
mlComplexityText = {
'en': ComplexityEnglish(),
'sp': ComplexitySpanish(),
'fr': ComplexityFrench(),
'pl': ComplexityPolish(),
'it': ComplexityItalian()
}
INPUT_DIR = args.i
OUTPUT_DIR = args.o
with open(INPUT_DIR+'/collection-info.json', 'r') as f:
collectionInfo = json.load(f)
f.close()
print(type(collectionInfo))
\ No newline at end of file
#!/home/garciacumbreras18/anaconda3/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
###############################################################################
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
#
# Universidad de Jaén - 2018
###############################################################################
import json
import os
from ComplexityLanguage import ComplexityLanguage
from ComplexitySpanish import ComplexitySpanish
from ComplexityEnglish import ComplexityEnglish
from ComplexityFrench import ComplexityFrench
from ComplexityPolish import ComplexityPolish
from ComplexityItalian import ComplexityItalian
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn import preprocessing
import argparse
## ----------------------------------------------------------------------------
##
## Read command lines arguments
##
parser = argparse.ArgumentParser(description='PAN2018 author identifier based on text complexity metrics')
parser.add_argument('-i', type=str, help='input directory')
parser.add_argument('-o', type=str, help='output directory')
args = parser.parse_args()
INPUT_DIR, OUTPUT_DIR = args.i, args.o
## ----------------------------------------------------------------------------
##
## Load of analyzers
##
print('Loading complexity analyzers for different languages...\n', flush=True)
mlComplexityText = {
'en': ComplexityEnglish(),
'sp': ComplexitySpanish(),
'fr': ComplexityFrench(),
'pl': ComplexityPolish(),
'it': ComplexityItalian()
}
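The analyzers are keyed by the collection's language codes (note 'sp', not ISO 'es', for Spanish), so per-problem dispatch is a plain dictionary lookup. An illustrative sketch (the problem entry is invented):

# Real entries come from collection-info.json, read below.
problem = {'problem-name': 'problem00001', 'language': 'en'}
complexityText = mlComplexityText[problem['language']]
print(complexityText.calcMetrics('Some document text. Two sentences here.'))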
## ----------------------------------------------------------------------------
##
## Corpus loading (both, train and test data sets)
##
complexity_known = pd.DataFrame()
complexity_unknown = pd.DataFrame()
labels = {}
labels_cand = []
#
# Iterate over all the problems
#
print('Loading collection-info.json file from', args.i, flush=True)
with open(INPUT_DIR+'/collection-info.json', 'r') as f:
collectionInfo = json.load(f)
f.close()
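Only two fields are read from each problem entry, 'problem-name' and 'language'. A plausible minimal shape for collection-info.json, written as a Python literal (values invented):

collection_info_example = [
    {"problem-name": "problem00001", "language": "en"},
    {"problem-name": "problem00002", "language": "sp"},
]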
for problem in collectionInfo:
print('\n\nProblem: ', problem['problem-name'], flush=True)
print('Language: ', problem['language'], flush=True)
#
# Load the class for computing complexity in the corresponding language
#
complexityText = mlComplexityText[problem['language']]
#
# Iterate over all the candidates
#
print("Loading problem data...\n", flush=True)
with open(INPUT_DIR + '/' + problem['problem-name'] + '/problem-info.json', 'r') as problem_info_fhd:
problem_info = json.load(problem_info_fhd)
problem_info_fhd.close()
#
# Read the texts of known authorship (TRAINING TEXTS)
#
print("Loading training data")
for candidate in problem_info['candidate-authors']:
print('Candidate: ', candidate['author-name'], flush=True)
files = os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], candidate['author-name']))
probcand = problem['problem-name'] + candidate['author-name']
if not probcand in labels:
labels[probcand] = len(labels)
labels_cand += [probcand]
#
# Process all the texts from this candidate
#
for i, nameFile in enumerate(files):
print('Reading text file: ', nameFile, flush=True)
with open(os.path.join(os.path.join(INPUT_DIR,problem['problem-name'], candidate['author-name']), nameFile),'r') as context:
calcmetrics = complexityText.calcMetrics(context.read())
dfi = pd.DataFrame(calcmetrics, index=[i])
dfi['problem'] = problem['problem-name']
dfi['language'] = problem['language']
dfi['candidate'] = candidate['author-name']
dfi['label'] = labels[probcand]
dfi['filename'] = nameFile
complexity_known = complexity_known.append([dfi])
#
# If a ground truth exists, read it to learn the candidates
#
unknown_candidates = False
if os.path.isfile(INPUT_DIR +'/'+ problem['problem-name'] + '/ground-truth.json'):
print("Reading ground truth...", flush=True)
with open(INPUT_DIR +'/'+ problem['problem-name'] + '/ground-truth.json', 'r') as ground_truth_fhd:
ground_truth = json.load(ground_truth_fhd)
ground_truth_fhd.close()
unknown_candidates = {}
for item in ground_truth['ground_truth']:
unknown_candidates[item['unknown-text']] = item['true-author']
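Each ground-truth record is only read for its 'unknown-text' and 'true-author' keys, so the file's shape can be inferred as (values invented):

ground_truth_example = {
    "ground_truth": [
        {"unknown-text": "unknown00001.txt", "true-author": "candidate00002"},
    ]
}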
#
# Iterate over the unlabelled files (TEST TEXTS)
#
print("Loading test data", flush=True)
for i, unknown_file in enumerate(os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], problem_info['unknown-folder']))):
print("Analyzing file", unknown_file, flush=True)
with open(INPUT_DIR + '/' + problem['problem-name'] + '/' + problem_info['unknown-folder'] + '/' + unknown_file, 'r') as unknown_fhd:
calcmetrics = complexityText.calcMetrics(unknown_fhd.read())
unknown_fhd.close()
dfi = pd.DataFrame(calcmetrics, index=[i])
dfi['problem'] = problem['problem-name']
dfi['language'] = problem['language']
if unknown_candidates and unknown_candidates[unknown_file]:
probcand = problem['problem-name'] + unknown_candidates[unknown_file]
dfi['candidate'] = unknown_candidates[unknown_file]
dfi['label'] = labels[probcand]
else:
dfi['candidate'] = None
dfi['label'] = None
dfi['filename'] = unknown_file
complexity_unknown = complexity_unknown.append([dfi])
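DataFrame.append was current in the pandas of 2018 but was removed in pandas 2.0; on modern pandas the in-loop accumulation above would instead read:

complexity_unknown = pd.concat([complexity_unknown, dfi])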
## ----------------------------------------------------------------------------
##
## Training and classification
##
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
from sklearn import svm
clf = svm.LinearSVC(C=1)
for problem in set(complexity_known['problem']):
answers = []
print('------- Training and classifying ', problem, flush=True)
#
# For training we take the known texts
#
train = complexity_known.loc[complexity_known['problem'] == problem]
train = train.dropna(axis=1, how='any')
train_target = train['label']
train_data= train.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
train_data = pd.DataFrame(preprocessing.normalize(train_data, norm='l2'))
#
# For testing we take the unknown texts
#
test = complexity_unknown.loc[complexity_unknown['problem'] == problem]
test = test.dropna(axis=1, how='any')
test_target = test['label']
test_data = test.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
test_data = pd.DataFrame(preprocessing.normalize(test_data, norm='l2'))
# Train on the texts with known candidates and predict on the unknown data
y_pred = clf.fit(train_data, train_target).predict(test_data)
for index, row in test.iterrows():
probcand = labels_cand[y_pred[index]]
answers.append({
'unknown-text': row['filename'],
'predicted-author': probcand[probcand.find("candidate"):],
})
with open(OUTPUT_DIR + '/answers-' + problem +'.json', 'w') as file:
json.dump(answers, file, indent=4)
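Each answers-<problem>.json written here is a JSON array pairing unknown files with predicted candidates, mirroring the dicts built above; an illustrative instance as a Python literal (values invented):

answers_example = [
    {"unknown-text": "unknown00001.txt", "predicted-author": "candidate00003"},
]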