official version

parent 9622b455
@@ -116,7 +116,7 @@ class ComplexityLanguage():
        lsentences=[]
        for words in self.sentences:
            lwords = []
            for w in words:
                if re.match('F.*', w.get_tag()):
                    punctuation.append(w.get_form())
@@ -127,7 +127,7 @@ class ComplexityLanguage():
        self.N_words = sum(lsentences)
        #print('Number of words (N_w): ', self.N_words, '\n' )
        self.mean_words = np.mean(lsentences)
        self.std_words = np.std(lsentences)
        self.N_punctuation = len(punctuation)
        self.punctuation = punctuation
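For reference, the lines above turn per-sentence word counts into document-level statistics (total, mean and standard deviation of sentence length, plus a punctuation count keyed on Freeling-style 'F*' tags). A minimal self-contained sketch of the same bookkeeping, assuming (form, tag) pairs instead of the tagger's token objects and assuming punctuation tokens are kept out of the word count:

```python
import re
import numpy as np

def sentence_length_stats(tagged_sentences):
    """Document-level length statistics from (form, tag) pairs, one list per sentence."""
    lsentences, punctuation = [], []
    for sent in tagged_sentences:
        lwords = []
        for form, tag in sent:
            if re.match('F.*', tag):      # Freeling punctuation tags start with 'F'
                punctuation.append(form)
            else:
                lwords.append(form)
        lsentences.append(len(lwords))
    return {
        'N_words': sum(lsentences),
        'mean_words': float(np.mean(lsentences)),
        'std_words': float(np.std(lsentences)),
        'N_punctuation': len(punctuation),
    }

print(sentence_length_stats([
    [('Hola', 'NC'), ('mundo', 'NC'), ('.', 'Fp')],
    [('Adiós', 'I'), ('.', 'Fp')],
]))
```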
...
@@ -2,6 +2,7 @@ import sys
sys.path.append('/home/garciacumbreras18/dist/treetagger')
import nltk
+import numpy as np
import re
from treetagger import TreeTagger
@@ -208,7 +209,12 @@ class ComplexityPolish():
        pos_sentences = []
        sent = []
        for w in sentences:
-           tag = w[1].split(':')[0]
+           if len(w) < 3:
+               continue
+           if w[1].find(':') == -1:
+               tag = w[1]
+           else:
+               tag = w[1].split(':')[0]
            if tag == 'SENT':
                pos_sentences.append(sent)
                sent = []
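The guard added above makes the Polish POS pass tolerant of TreeTagger rows with fewer than three fields and of tags without a ':' separator (Polish tagset tags such as 'subst:sg:nom:f' are truncated to their first field). A minimal sketch of the patched loop; the [form, tag, lemma] row layout and the "append the tag to the current sentence" else-branch are assumptions about the surrounding code:

```python
def split_pos_sentences(tagged_rows):
    """Group TreeTagger rows into per-sentence coarse POS lists."""
    pos_sentences, sent = [], []
    for w in tagged_rows:
        if len(w) < 3:                                   # skip malformed/short rows
            continue
        tag = w[1] if w[1].find(':') == -1 else w[1].split(':')[0]
        if tag == 'SENT':                                # sentence delimiter
            pos_sentences.append(sent)
            sent = []
        else:
            sent.append(tag)
    return pos_sentences

rows = [['Ala', 'subst:sg:nom:f', 'Ala'],
        ['ma', 'fin:sg:ter:imperf', 'mieć'],
        ['kota', 'subst:sg:acc:m2', 'kot'],
        ['.', 'SENT', '.']]
print(split_pos_sentences(rows))   # [['subst', 'fin', 'subst']]
```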
@@ -218,7 +224,7 @@ class ComplexityPolish():
        return self.pos_sentences
    def calcMetricsExtend(self, text):
        """
        Computes the complexity metrics enabled in the configuration
        """
...
@@ -307,7 +307,7 @@ class ComplexitySpanish(ComplexityLanguage):
        return metrics
    def calcMetricsExtend(self, text):
        """
        Computes the complexity metrics enabled in the configuration
        """
...
@@ -111,7 +111,9 @@ for problem in collectionInfo:
            print('Reading text file: ', nameFile, flush=True)
            with open(os.path.join(os.path.join(INPUT_DIR, problem['problem-name'], candidate['author-name']), nameFile), 'r') as context:
-               calcmetrics = complexityText.calcMetrics(context.read())
+               content = context.read()
+               context.close()
+               calcmetrics = complexityText.calcMetrics(content)
            dfi = pd.DataFrame(calcmetrics, index=[i])
            dfi['problem'] = problem['problem-name']
@@ -142,9 +144,10 @@ for problem in collectionInfo:
    for i, unknown_file in enumerate(os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], problem_info['unknown-folder']))):
        print("Analyzing file", unknown_file, flush=True)
        with open(INPUT_DIR + '/' + problem['problem-name'] + '/' + problem_info['unknown-folder'] + '/' + unknown_file, 'r') as unknown_fhd:
-           calcmetrics = complexityText.calcMetrics(unknown_fhd.read())
+           content = unknown_fhd.read()
            unknown_fhd.close()
+           calcmetrics = complexityText.calcMetrics(content)
        dfi = pd.DataFrame(calcmetrics, index=[i])
        dfi['problem'] = problem['problem-name']
        dfi['language'] = problem['language']
@@ -181,17 +184,23 @@ for problem in set(complexity_known['problem']):
    train = complexity_known.loc[complexity_known['problem'] == problem]
    train = train.dropna(axis=1, how='any')
    train_target = train['label']
-   train_data= train.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
-   train_data = pd.DataFrame(preprocessing.normalize(train_data, norm='l2', axis=args.axis))
+   train_data = train.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
    #
    # For the test set we take the unknown texts
    #
    test = complexity_unknown.loc[complexity_unknown['problem'] == problem]
    test = test.dropna(axis=1, how='any')
-   test_target = test['label']
-   test_data = test.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
-   test_data = pd.DataFrame(preprocessing.normalize(test_data, norm='l2', axis=args.axis))
+   test_data = test.drop(['problem', 'language', 'filename'], axis=1)
+   #
+   # Normalize
+   #
+   data = pd.concat([train_data, test_data])
+   data = pd.DataFrame(preprocessing.normalize(data, norm='l2', axis=args.axis))
+   train_data = data.iloc[:train_data.shape[0],:]
+   test_data = data.iloc[train_data.shape[0]:,:]
    # Train on the texts with known candidates and predict on the unknown data
    y_pred = clf.fit(train_data, train_target).predict(test_data)
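The change above replaces the two independent normalize() calls with a single joint normalization: the train and test feature frames are concatenated, L2-normalized in one pass, and split back by row position. A minimal sketch of the same idea, with hypothetical frame names:

```python
import pandas as pd
from sklearn import preprocessing

def normalize_together(train_df, test_df, axis=1):
    """L2-normalize train and test in one call, then split back by position."""
    data = pd.concat([train_df, test_df])
    data = pd.DataFrame(preprocessing.normalize(data, norm='l2', axis=axis))
    n_train = train_df.shape[0]
    return data.iloc[:n_train, :], data.iloc[n_train:, :]

train = pd.DataFrame({'f1': [1.0, 2.0], 'f2': [3.0, 4.0]})
test = pd.DataFrame({'f1': [5.0], 'f2': [6.0]})
train_norm, test_norm = normalize_together(train, test, axis=0)
print(train_norm.shape, test_norm.shape)   # (2, 2) (1, 2)
```

With the default --axis 1 each sample is scaled independently, so the concatenation only unifies the code path; with --axis 0 it scales test features with the same per-feature norms as the training features, at the cost of computing those norms over both splits together.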
...
#!/home/garciacumbreras18/anaconda3/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
###############################################################################
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
#
# Universidad de Jaén - 2018
###############################################################################
import json
import os
from ComplexityLanguage import ComplexityLanguage
from ComplexitySpanish import ComplexitySpanish
from ComplexityEnglish import ComplexityEnglish
from ComplexityFrench import ComplexityFrench
from ComplexityPolish import ComplexityPolish
from ComplexityItalian import ComplexityItalian
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn import preprocessing
import argparse
## ----------------------------------------------------------------------------
##
## Read command lines arguments
##
parser = argparse.ArgumentParser(description='PAN2018 author identifier based on text complexity metrics')
parser.add_argument('-i', type=str, help='input directory')
parser.add_argument('-o', type=str, help='output directory')
parser.add_argument('-x', '--axis', type=int, choices=[0,1], default=1, help='apply L2 normalization by sample (1, default) or by feature (0)')
args = parser.parse_args()
INPUT_DIR, OUTPUT_DIR = args.i, args.o
## ----------------------------------------------------------------------------
##
## Load of analyzers
##
print('Loading complexity analyzers for different languages...\n', flush=True)
mlComplexityText = {
    'en': ComplexityEnglish(),
    'sp': ComplexitySpanish(),
    'fr': ComplexityFrench(),
    'pl': ComplexityPolish(),
    'it': ComplexityItalian()
}
## ----------------------------------------------------------------------------
##
## Corpus loading (both, train and test data sets)
##
complexity_known = pd.DataFrame()
complexity_unknown = pd.DataFrame()
labels = {}
labels_cand = []
#
# Loop over all the problems
#
print('Loading collection-info.json file from', args.i, flush=True)
with open(INPUT_DIR+'/collection-info.json', 'r') as f:
    collectionInfo = json.load(f)
    f.close()
for problem in collectionInfo:
    print('\n\nProblem: ', problem['problem-name'], flush=True)
    print('Language: ', problem['language'], flush=True)
    #
    # Load the complexity class for the corresponding language
    #
    complexityText = mlComplexityText[problem['language']]
    #
    # Loop over all the candidates
    #
    print("Loading problem data...\n", flush=True)
    with open(INPUT_DIR + '/' + problem['problem-name'] + '/problem-info.json', 'r') as problem_info_fhd:
        problem_info = json.load(problem_info_fhd)
        problem_info_fhd.close()
    #
    # Read the texts of known authorship (TRAINING TEXTS)
    #
    print("Loading training data")
    for candidate in problem_info['candidate-authors']:
        print('Candidate: ', candidate['author-name'], flush=True)
        files = os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], candidate['author-name']))
        probcand = problem['problem-name'] + candidate['author-name']
        if not probcand in labels:
            labels[probcand] = len(labels)
            labels_cand += [probcand]
        #
        # Process all the texts of this candidate
        #
        for i, nameFile in enumerate(files):
            print('Reading text file: ', nameFile, flush=True)
            with open(os.path.join(os.path.join(INPUT_DIR, problem['problem-name'], candidate['author-name']), nameFile), 'r') as context:
                content = context.read()
                context.close()
            calcmetrics = complexityText.calcMetrics(content)
            calcmetrics_ext = complexityText.calcMetricsExtend(content)
            dfi = pd.DataFrame({**calcmetrics, **calcmetrics_ext}, index=[i])
            dfi['problem'] = problem['problem-name']
            dfi['language'] = problem['language']
            dfi['candidate'] = candidate['author-name']
            dfi['label'] = labels[probcand]
            dfi['filename'] = nameFile
            complexity_known = complexity_known.append([dfi])
    #
    # If a ground-truth file exists, read it to learn the true candidates
    #
    unknown_candidates = False
    if os.path.isfile(INPUT_DIR + '/' + problem['problem-name'] + '/ground-truth.json'):
        print("Reading ground truth...", flush=True)
        with open(INPUT_DIR + '/' + problem['problem-name'] + '/ground-truth.json', 'r') as ground_truth_fhd:
            ground_truth = json.load(ground_truth_fhd)
            ground_truth_fhd.close()
        unknown_candidates = {}
        for item in ground_truth['ground_truth']:
            unknown_candidates[item['unknown-text']] = item['true-author']
    #
    # Loop over the unlabelled files (TEST TEXTS)
    #
    print("Loading test data", flush=True)
    for i, unknown_file in enumerate(os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], problem_info['unknown-folder']))):
        print("Analyzing file", unknown_file, flush=True)
        with open(INPUT_DIR + '/' + problem['problem-name'] + '/' + problem_info['unknown-folder'] + '/' + unknown_file, 'r') as unknown_fhd:
            content = unknown_fhd.read()
            unknown_fhd.close()
        calcmetrics = complexityText.calcMetrics(content)
        calcmetrics_ext = complexityText.calcMetricsExtend(content)
        dfi = pd.DataFrame({**calcmetrics, **calcmetrics_ext}, index=[i])
        dfi['problem'] = problem['problem-name']
        dfi['language'] = problem['language']
        if unknown_candidates and unknown_candidates[unknown_file]:
            probcand = problem['problem-name'] + unknown_candidates[unknown_file]
            dfi['candidate'] = unknown_candidates[unknown_file]
            dfi['label'] = labels[probcand]
        else:
            dfi['candidate'] = None
            dfi['label'] = None
        dfi['filename'] = unknown_file
        complexity_unknown = complexity_unknown.append([dfi])
## ----------------------------------------------------------------------------
##
## Training and classification
##
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
from sklearn import svm
clf = svm.LinearSVC(C=1)
for problem in set(complexity_known['problem']):
    answers = []
    print('------- Training and classifying ', problem, flush=True)
    #
    # For training we take the texts of known authorship
    #
    train = complexity_known.loc[complexity_known['problem'] == problem]
    train = train.dropna(axis=1, how='any')
    train_target = train['label']
    train_data = train.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
    #
    # For the test set we take the unknown texts
    #
    test = complexity_unknown.loc[complexity_unknown['problem'] == problem]
    test = test.dropna(axis=1, how='any')
    test_target = test['label']
    test_data = test.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
    #
    # Normalize
    #
    data = pd.concat([train_data, test_data])
    data = pd.DataFrame(preprocessing.normalize(data, norm='l2', axis=args.axis))
    train_data = data.iloc[:train_data.shape[0],:]
    test_data = data.iloc[train_data.shape[0]:,:]
    # Train on the texts with known candidates and predict on the unknown data
    y_pred = clf.fit(train_data, train_target).predict(test_data)
    for index, row in test.iterrows():
        probcand = labels_cand[y_pred[index]]
        answers.append({
            'unknown-text': row['filename'],
            'predicted-author': probcand[probcand.find("candidate"):],
        })
    with open(OUTPUT_DIR + '/answers-' + problem + '.json', 'w') as file:
        json.dump(answers, file, indent=4)
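For readers skimming the script above: each (problem, candidate) pair is mapped to an integer class id through the labels dict, and the predicted author name is later recovered by slicing the stored probcand string at the 'candidate' substring. A minimal sketch of that round trip, using PAN-style folder names such as 'problem00001' and 'candidate00003' as illustrative values:

```python
labels = {}        # 'problem00001candidate00003' -> integer class id
labels_cand = []   # class id -> 'problem00001candidate00003'

def encode(problem_name, author_name):
    """Assign (or reuse) an integer label for a problem/candidate pair."""
    probcand = problem_name + author_name
    if probcand not in labels:
        labels[probcand] = len(labels)
        labels_cand.append(probcand)
    return labels[probcand]

def decode(class_id):
    """Recover the candidate name by stripping the problem prefix."""
    probcand = labels_cand[class_id]
    return probcand[probcand.find('candidate'):]

cid = encode('problem00001', 'candidate00003')
print(cid, decode(cid))   # 0 candidate00003
```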
...@@ -178,8 +178,7 @@ for problem in set(postf['problem']): ...@@ -178,8 +178,7 @@ for problem in set(postf['problem']):
# Calculamos el modelo de espacio vectorial # Calculamos el modelo de espacio vectorial
# #
tfidfVectorizer = TfidfVectorizer(ngram_range=(1, args.ngramsize), use_idf=args.idf) tfidfVectorizer = TfidfVectorizer(ngram_range=(1, args.ngramsize), use_idf=args.idf)
postf['POStfidf'] = list(tfidfVectorizer.fit_transform(postf['Pos'])) postf['POStfidf'] = list(tfidfVectorizer.fit_transform(postf['Pos']))
# #
...@@ -190,7 +189,6 @@ for problem in set(postf['problem']): ...@@ -190,7 +189,6 @@ for problem in set(postf['problem']):
train = train.dropna(axis=1, how='any') train = train.dropna(axis=1, how='any')
train_target = train['label'] train_target = train['label']
train_data = np.array(list(train['POStfidf'].apply(lambda x: x.toarray()[0]))) train_data = np.array(list(train['POStfidf'].apply(lambda x: x.toarray()[0])))
train_data = pd.DataFrame(preprocessing.normalize(train_data, norm='l2', axis=args.axis))
# #
# Para el test cogemos los textos desconocidos # Para el test cogemos los textos desconocidos
...@@ -198,9 +196,15 @@ for problem in set(postf['problem']): ...@@ -198,9 +196,15 @@ for problem in set(postf['problem']):
test = postf[postf['filename'].str.contains(r"\bunknown", regex=True)] test = postf[postf['filename'].str.contains(r"\bunknown", regex=True)]
test = test.loc[test['problem'] == problem] test = test.loc[test['problem'] == problem]
test = test.dropna(axis=1, how='any') test = test.dropna(axis=1, how='any')
test_target = test['label']
test_data = np.array(list(test['POStfidf'].apply(lambda x: x.toarray()[0]))) test_data = np.array(list(test['POStfidf'].apply(lambda x: x.toarray()[0])))
train_data = pd.DataFrame(preprocessing.normalize(train_data, norm='l2', axis=args.axis))
#
# Normalizamos
#
data = pd.concat([pd.DataFrame(train_data), pd.DataFrame(test_data)])
data = pd.DataFrame(preprocessing.normalize(data, norm='l2', axis=args.axis))
train_data = data.iloc[:train_data.shape[0],:]
test_data = data.iloc[train_data.shape[0]:,:]
# Entrenamos con los textos con candidatos conocidos y predecimos con los datos desconocidos # Entrenamos con los textos con candidatos conocidos y predecimos con los datos desconocidos
y_pred = clf.fit(train_data, train_target).predict(test_data) y_pred = clf.fit(train_data, train_target).predict(test_data)
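This second script applies the same joint normalization to a TF-IDF model built over POS tag sequences. A minimal sketch of how the 'POStfidf' column is built and then densified for the classifier; the space-joined format of the 'Pos' column and the small tag vocabulary are assumptions for illustration:

```python
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# One space-joined POS sequence per document (assumed 'Pos' column format)
postf = pd.DataFrame({'Pos': ['NC VM NC SENT', 'NC VM AQ NC SENT', 'VM NC SENT']})

tfidfVectorizer = TfidfVectorizer(ngram_range=(1, 2), use_idf=True)
postf['POStfidf'] = list(tfidfVectorizer.fit_transform(postf['Pos']))

# Each cell holds a 1 x V sparse row; .toarray()[0] flattens it to a dense vector
dense = np.array(list(postf['POStfidf'].apply(lambda x: x.toarray()[0])))
print(dense.shape)   # (3, vocabulary size of POS n-grams)
```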
...