new version

parent 8f96e893
import sys
sys.path.append('/home/garciacumbreras18/dist/treetagger')
import nltk
import re
from treetagger import TreeTagger
class ComplexityPolish():

    def __init__(self, lang='pl'):
        """
        config is a list of boolean values that enable or disable the computation of each measure:
        config = [
            True|False,  # PUNCTUATION MARKS
            True|False,  # ARI
            True|False,  # FOG
            True|False,  # FLESCH
            True|False,  # FLESCH-KINCAID
            True|False,  # PISAREK
        ]
        If config == None, all supported complexity metrics are computed.
        """
        self.config = [True, True, True, True, True, True]
        self.metricsStr = ['AVERAGE PUNCTUATION MARKS', 'ARI', 'FOG', 'FLESCH', 'FLESCH-KINCAID', 'PISAREK']
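    # Example usage (hypothetical values): disable all metrics except ARI and FOG
    # by overriding the default config:
    #   cp = ComplexityPolish()
    #   cp.config = [False, True, True, False, False, False]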
    def textProcessing(self, text):
        text = text.replace(u'\xa0', u' ')
        '''
...@@ -41,7 +45,7 @@ class ComplexityPolish():
        N_text_tokens = len(self.text_tokens)
        self.N_text_tokens = N_text_tokens
        #print('Tokens: ', self.N_text_tokens)
        # and now we rebuild the sentences from the isolated full stops
        sentences = []
        ini = 0
...@@ -60,8 +64,8 @@ class ComplexityPolish():
        N_sentences = len(sentences)
        self.N_sentences = N_sentences
        #print('Sentences: ', self.sentences)
        N_charac = 0
        for word in self.text_tokens:
...@@ -79,15 +83,15 @@ class ComplexityPolish():
            count += 1
            if count >= 3:
                N_syllables3 += 1
        self.N_syllables = N_syllables
        self.N_syllables3 = N_syllables3
        #print('The number of syllables is: ', self.N_syllables)
        #print('The number of syllables3 is: ', self.N_syllables3)
        return self.text_tokens, self.N_text_tokens, self.sentences, self.N_sentences, self.N_charac, self.N_syllables, self.N_syllables3
    def punctuationMarks(self):
        N_punctuation = 0
        letters = []
...@@ -96,34 +100,34 @@ class ComplexityPolish():
            if re.match('[a-zA-Z]|á|ó|í|ú|é', word):
                letters.append(word)
                N_letters += len(word)
            else:
                N_punctuation += 1
        self.words = letters
        self.N_words = len(letters)
        #print('N_words: ', self.N_words)
        self.N_letters = N_letters
        self.N_punctuation = N_punctuation
        if self.N_words == 0:
            punctuation_over_words = 0
        else:
            punctuation_over_words = self.N_punctuation / self.N_words
        self.punctuation_over_words = punctuation_over_words
        #print('The number of letters is: ', N_letters)
        #print('The list of letters is: ', letters)
        #print('The PUNCTUATION MARKS is: ', self.N_punctuation, '\n')
        return self.punctuation_over_words, self.N_punctuation, self.words, self.N_words, self.N_letters
    def readability(self):
        ARI = 4.71 * self.N_charac / self.N_words + 0.5 * self.N_words / self.N_sentences - 21.43
        self.ARI = ARI
        #print("AUTOMATED READABILITY INDEX (ARI) = ", self.ARI, '\n')
        fogreadability = 0.4 * (self.N_words / self.N_sentences + 100 * self.N_syllables3 / self.N_words)
        self.fogreadability = fogreadability
        #print("FOG: ", self.fogreadability, "\n")
...@@ -133,29 +137,29 @@ class ComplexityPolish():
#print("Syllables:", self.N_syllables) #print("Syllables:", self.N_syllables)
#print("Sentences:", self.N_sentences) #print("Sentences:", self.N_sentences)
#print("FLESCH: ", self.fleschreadability, "\n") #print("FLESCH: ", self.fleschreadability, "\n")
fkincaidreadability = - 15.59 + 11.8 * (self.N_syllables / self.N_words) + 0.39 * (self.N_words / self.N_sentences) fkincaidreadability = - 15.59 + 11.8 * (self.N_syllables / self.N_words) + 0.39 * (self.N_words / self.N_sentences)
self.fkincaidreadability = fkincaidreadability self.fkincaidreadability = fkincaidreadability
#print("FLESCH-KINCAID: ", self.fkincaidreadability, "\n") #print("FLESCH-KINCAID: ", self.fkincaidreadability, "\n")
self.fkincaidreadability = fkincaidreadability self.fkincaidreadability = fkincaidreadability
pisarekreadability = (self.N_words / self.N_sentences)/3 + self.N_syllables3/3 +1 pisarekreadability = (self.N_words / self.N_sentences)/3 + self.N_syllables3/3 +1
self.pisarekreadability = pisarekreadability self.pisarekreadability = pisarekreadability
#print("PISAREK (2007): ", self.pisarekreadability, "\n") #print("PISAREK (2007): ", self.pisarekreadability, "\n")
return self.ARI, self.fogreadability, self.fleschreadability, self.fkincaidreadability, self.pisarekreadability return self.ARI, self.fogreadability, self.fleschreadability, self.fkincaidreadability, self.pisarekreadability
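    # Worked example (hypothetical counts, not taken from a real text): with
    # N_charac=500, N_words=100 and N_sentences=5, the ARI formula above gives
    #   ARI = 4.71 * 500/100 + 0.5 * 100/5 - 21.43 = 23.55 + 10.0 - 21.43 = 12.12,
    # i.e. roughly a 12th-grade reading level on the usual ARI interpretation.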
    def calcMetrics(self, text):
        self.textProcessing(text)
        metrics = {}
        metricsPo = self.metricsStr
        readability = None
        for i in range(0, len(metricsPo)):
            if (self.config == None or self.config[i]) and metricsPo[i] == 'AVERAGE PUNCTUATION MARKS':
                punctuationmarks = self.punctuationMarks()
                metrics['AVERAGE PUNCTUATION MARKS'] = punctuationmarks[0]
...@@ -174,6 +178,23 @@ class ComplexityPolish():
            if (self.config == None or self.config[i]) and metricsPo[i] == 'PISAREK':
                if not readability:
                    readability = self.readability()
                metrics['PISAREK'] = readability[4]
        return metrics
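    # Minimal usage sketch (illustrative text and output shape only):
    #   cp = ComplexityPolish()
    #   metrics = cp.calcMetrics('Ala ma kota. Kot ma Ale.')
    #   # -> {'AVERAGE PUNCTUATION MARKS': ..., 'ARI': ..., 'FOG': ..., ...}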
    def getPOS(self, text):
        # Tag the text with TreeTagger and group the POS tags into sentences.
        tt = TreeTagger(language='polish')
        sentences = tt.tag(text)
        pos_sentences = []
        sent = []
        for w in sentences:
            # Keep only the coarse POS category (Polish tags are colon-separated).
            tag = w[1].split(':')[0]
            if tag == 'SENT':
                # Sentence-final punctuation closes the current sentence.
                pos_sentences.append(sent)
                sent = []
            else:
                sent += [tag]
        self.pos_sentences = pos_sentences
        return self.pos_sentences
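    # Sketch of the expected shape of getPOS() output (tags are illustrative,
    # not actual TreeTagger output):
    #   cp.getPOS('Ala ma kota. Kot ma Ale.')
    #   # -> [['subst', 'fin', 'subst'], ['subst', 'fin', 'subst']]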
...@@ -192,7 +192,7 @@ for problem in set(complexity_known['problem']):
    test_data = test.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
    test_data = pd.DataFrame(preprocessing.normalize(test_data, norm='l2'))
    # Train on the texts with known candidates and predict on the unknown data
    y_pred = clf.fit(train_data, train_target).predict(test_data)
    for index, row in test.iterrows():
...
#!/home/garciacumbreras18/anaconda3/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
###############################################################################
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
#
# Universidad de Jaén - 2018
###############################################################################
import json
import os
from ComplexityLanguage import ComplexityLanguage
from ComplexitySpanish import ComplexitySpanish
from ComplexityEnglish import ComplexityEnglish
from ComplexityFrench import ComplexityFrench
from ComplexityPolish import ComplexityPolish
from ComplexityItalian import ComplexityItalian
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
import argparse
## ----------------------------------------------------------------------------
##
## Read command line arguments
##
parser = argparse.ArgumentParser(description='PAN2018 author identifier based on POS vectors')
parser.add_argument('-i', '--input', type=str, help='input directory')
parser.add_argument('-o', '--output', type=str, help='output directory')
parser.add_argument('-n', '--ngramsize', type=int, help='maximum n-gram size', choices=[1,2,3], default=2)
parser.add_argument('-f', '--idf', action='store_true', help='apply inverse document frequency', default=False)
args = parser.parse_args()
INPUT_DIR, OUTPUT_DIR = args.input, args.output
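# Example invocation (hypothetical script name and paths):
#   python pan2018_pos.py -i ./pan18-input -o ./pan18-output -n 2 -f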
## ----------------------------------------------------------------------------
##
## Load of analyzers
##
print('Loading complexity analyzers for different languages...\n', flush=True)
mlComplexityText = {
'en': ComplexityEnglish(),
'sp': ComplexitySpanish(),
'fr': ComplexityFrench(),
'pl': ComplexityPolish(),
'it': ComplexityItalian()
}
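# All analyzers share the getPOS() interface used below, so a text can be routed
# by its two-letter language code, e.g. (illustrative):
#   mlComplexityText['pl'].getPOS('Ala ma kota.')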
## ----------------------------------------------------------------------------
##
## Corpus loading (both, train and test data sets)
##
postf = pd.DataFrame()
labels = {}
labels_cand = []
#
# Iterate over all problems
#
print('Loading collection-info.json file from', INPUT_DIR, flush=True)
with open(INPUT_DIR + '/collection-info.json', 'r') as f:
    collectionInfo = json.load(f)
for problem in collectionInfo:
    print('\n\nProblem: ', problem['problem-name'], flush=True)
    print('Language: ', problem['language'], flush=True)
    #
    # Load the complexity analyzer class for the corresponding language
    #
    complexityText = mlComplexityText[problem['language']]
    #
    # Iterate over all candidates
    #
    print("Loading problem data...\n", flush=True)
    with open(INPUT_DIR + '/' + problem['problem-name'] + '/problem-info.json', 'r') as problem_info_fhd:
        problem_info = json.load(problem_info_fhd)
    #
    # Read the texts of known authorship (TRAINING TEXTS)
    #
    print("Loading training data")
    for candidate in problem_info['candidate-authors']:
        print('Candidate: ', candidate['author-name'], flush=True)
        files = os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], candidate['author-name']))
        probcand = problem['problem-name'] + candidate['author-name']
        if probcand not in labels:
            labels[probcand] = len(labels)
            labels_cand += [probcand]
        #
        # Process all the texts by this candidate
        #
        for i, nameFile in enumerate(files):
            print('Reading text file: ', nameFile, flush=True)
            with open(os.path.join(INPUT_DIR, problem['problem-name'], candidate['author-name'], nameFile), 'r') as fhnd:
                postags = complexityText.getPOS(fhnd.read())
            postags = " ".join([" ".join(p) for p in postags])
            dfi = pd.DataFrame({'Pos': postags}, index=[i])
            dfi['problem'] = problem['problem-name']
            dfi['language'] = problem['language']
            dfi['candidate'] = candidate['author-name']
            dfi['label'] = labels[probcand]
            dfi['filename'] = nameFile
            postf = postf.append([dfi])
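    # Note: each document has been flattened to a single space-separated POS string,
    # e.g. (illustrative tags) [['subst', 'fin'], ['adj']] -> 'subst fin adj', which
    # is the plain-text input that TfidfVectorizer expects later on.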
    #
    # If a ground-truth file exists, read it to learn the true candidates
    #
    unknown_candidates = False
    if os.path.isfile(INPUT_DIR + '/' + problem['problem-name'] + '/ground-truth.json'):
        print("Reading ground truth...", flush=True)
        with open(INPUT_DIR + '/' + problem['problem-name'] + '/ground-truth.json', 'r') as fhnd:
            ground_truth = json.load(fhnd)
        unknown_candidates = {}
        for item in ground_truth['ground_truth']:
            unknown_candidates[item['unknown-text']] = item['true-author']
    #
    # Iterate over the unlabeled files (TEST TEXTS)
    #
    print("Loading test data", flush=True)
    for i, unknown_file in enumerate(os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], problem_info['unknown-folder']))):
        print("Analyzing file", unknown_file, flush=True)
        with open(INPUT_DIR + '/' + problem['problem-name'] + '/' + problem_info['unknown-folder'] + '/' + unknown_file, 'r') as fhnd:
            postags = complexityText.getPOS(fhnd.read())
        postags = " ".join([" ".join(p) for p in postags])
        dfi = pd.DataFrame({'Pos': postags}, index=[i])
        dfi['problem'] = problem['problem-name']
        dfi['language'] = problem['language']
        if unknown_candidates and unknown_candidates[unknown_file]:
            probcand = problem['problem-name'] + unknown_candidates[unknown_file]
            dfi['candidate'] = unknown_candidates[unknown_file]
            dfi['label'] = labels[probcand]
        else:
            dfi['candidate'] = None
            dfi['label'] = None
        dfi['filename'] = unknown_file
        postf = postf.append([dfi])
## ----------------------------------------------------------------------------
##
## Training and classification
##
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
from sklearn import svm
clf = svm.LinearSVC(C=1)
for problem in set(postf['problem']):
    answers = []
    print('------- Training and classifying ', problem, flush=True)
    #
    # Build the vector space model over the POS n-grams
    #
    tfidfVectorizer = TfidfVectorizer(ngram_range=(1, args.ngramsize), use_idf=args.idf, norm='l2')
    postf['POStfidf'] = list(tfidfVectorizer.fit_transform(postf['Pos']))
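    # Illustrative only: with ngram_range=(1, 2), a POS string such as 'subst fin subst'
    # produces the features 'subst', 'fin', 'subst fin' and 'fin subst'; with the
    # default --idf off, these are plain L2-normalized term frequencies.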
    #
    # For training, take the texts by known candidates
    #
    train = postf[postf['filename'].str.contains(r"\bknown", regex=True)]
    train = train.loc[train['problem'] == problem]
    train = train.dropna(axis=1, how='any')
    train_target = train['label']
    train_data = np.array(list(train['POStfidf'].apply(lambda x: x.toarray()[0])))
    #
    # For testing, take the unknown texts
    #
    test = postf[postf['filename'].str.contains(r"\bunknown", regex=True)]
    test = test.loc[test['problem'] == problem]
    test = test.dropna(axis=1, how='any')
    test_target = test['label']
    test_data = np.array(list(test['POStfidf'].apply(lambda x: x.toarray()[0])))
    # Train on the texts with known candidates and predict on the unknown data
    y_pred = clf.fit(train_data, train_target).predict(test_data)
    for index, row in test.iterrows():
        probcand = labels_cand[y_pred[index]]
        answers.append({
            'unknown-text': row['filename'],
            'predicted-author': probcand[probcand.find("candidate"):],
        })
    with open(OUTPUT_DIR + '/answers-' + problem + '.json', 'w') as file:
        json.dump(answers, file, indent=4)
print("done!")