classify_comp working on tira.io

8f96e893 · Arturo Montejo Ráez · 1df9d9e2 · 8f96e893 · 8f96e893 · 8f96e893
Commit 8f96e893 authored May 12, 2018 by Arturo Montejo Ráez
Showing with 228 additions and 65 deletions
.gitignore
ComplexityEnglish.py
ComplexityFrench.py
ComplexityItalian.py
ComplexityLanguage.py
ComplexitySpanish.py
classify_comp
classify_comp.py
--- a/.gitignore
+++ b/.gitignore
+__pycache__
--- a/ComplexityEnglish.py
+++ b/ComplexityEnglish.py
+import sys
+sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
 from ComplexityLanguage import ComplexityLanguage
 import re
 import math

--- a/ComplexityFrench.py
+++ b/ComplexityFrench.py
+import sys
+sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
 from ComplexityLanguage import ComplexityLanguage
 import freeling
 import os
@@ -15,9 +17,9 @@ class ComplexityFrench(ComplexityLanguage):
        ComplexityLanguage.__init__(self, lang)
       
        ## Modify this line to be your FreeLing installation directory
-        FREELINGDIR = "/usr/local"
-        DATA = FREELINGDIR+"/share/freeling/"
-        CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
+        FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
+        DATA = FREELINGDIR+"/data/"
+        CLASSDIR = ""
        self.lang = lang
        freeling.util_init_locale("default")

@@ -51,11 +53,7 @@ class ComplexityFrench(ComplexityLanguage):
        self.sen=freeling.senses(DATA+lang+"/senses.dat")
       
     
-        
-        #Listas de palabras de Dale-Chall
-        CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
-     
-        f = open(CLASSDIR + 'DaleChall.txt')
+        f = open(CLASSDIR + '/home/garciacumbreras18/DaleChall.txt')
        lines = f.readlines()
        f.close()


--- a/ComplexityItalian.py
+++ b/ComplexityItalian.py
 # -*- coding: utf-8 -*-
+import sys
+sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
 import freeling
 import os
 import re
@@ -12,9 +14,9 @@ class ComplexityItalian():
    def __init__(self, lang = 'it'):
               
        ## Modify this line to be your FreeLing installation directory
-        FREELINGDIR = "/usr/local"
-        DATA = FREELINGDIR+"/share/freeling/"
-        CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
+        FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
+        DATA = FREELINGDIR+"/data/"
+
        self.DATA = DATA
        self.lang = lang
        freeling.util_init_locale("default")

--- a/ComplexityLanguage.py
+++ b/ComplexityLanguage.py
 # -*- coding: utf-8 -*-
+
+import sys
+sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
+
 import freeling
 import os
 import re
@@ -12,8 +16,8 @@ class ComplexityLanguage():
    def __init__(self, lang):

        ## Modify this line to be your FreeLing installation directory
-        FREELINGDIR = "/usr/local"
-        DATA = FREELINGDIR+"/share/freeling/"
+        FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
+        DATA = FREELINGDIR+"/data/"
        self.DATA = DATA

        self.lang = lang
@@ -272,4 +276,3 @@ class ComplexityLanguage():
        self.pos_sentences = pos_sentences

        return self.pos_sentences
-    
\ No newline at end of file
--- a/ComplexitySpanish.py
+++ b/ComplexitySpanish.py
@@ -17,7 +17,7 @@ class ComplexitySpanish(ComplexityLanguage):
        self.dep=freeling.dep_txala(self.DATA+self.lang+"/dep_txala/dependences.dat", self.parser.get_start_symbol())
        
        # Para leer el texto que introducimos
-        CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
+        CLASSDIR = "/home/garciacumbreras18/"
                
        f = open(CLASSDIR + 'CREA_total.txt')
        lines = f.readlines()

--- a/classify_comp
+++ b/classify_comp
-#!/usr/bin/python3
-#/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# Authors:
-# Rocío López-Anguita (rlanguit@ujaen.es)
-# Arturo Montejo-Ráez (amontejo@ujaen.es)
-# Centro de Estudios Avanzados en TIC (CEATIC)
-# Universidad de Jaén
-# 2018
-
-import json
-import os
-from ComplexityLanguage import ComplexityLanguage
-from ComplexitySpanish import ComplexitySpanish
-from ComplexityEnglish import ComplexityEnglish
-from ComplexityFrench import ComplexityFrench
-from ComplexityPolish import ComplexityPolish
-from ComplexityItalian import ComplexityItalian
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plt
-from mpl_toolkits.mplot3d import Axes3D
-from sklearn.decomposition import PCA
-from sklearn import preprocessing
-import argparse
-
-parser = argparse.ArgumentParser(description='PAN2018 author identificator based on text complexity metrics')
-parser.add_argument('-i', type=string, help='input directory')
-parser.add_argument('-o', type=string, help='output directory')
-args = parser.parse_args()
-print(args.i, args.o)
-exit()
-print('Loading complexity analyzers for different languages...\n', flush=True)
-mlComplexityText = {
-    'en': ComplexityEnglish(),
-    'sp': ComplexitySpanish(),
-    'fr': ComplexityFrench(),
-    'pl': ComplexityPolish(),
-    'it': ComplexityItalian()
-}
-
-INPUT_DIR = args.i
-OUTPUT_DIR = args.o
-with open(INPUT_DIR+'/collection-info.json', 'r') as f:      
-    collectionInfo = json.load(f)  
-    f.close()
-print(type(collectionInfo))
\ No newline at end of file
--- a/classify_comp.py
+++ b/classify_comp.py
+#!/home/garciacumbreras18/anaconda3/bin/python3
+#/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###############################################################################
+# Authors:
+# Rocío López-Anguita (rlanguit@ujaen.es)
+# Arturo Montejo-Ráez (amontejo@ujaen.es)
+# Centro de Estudios Avanzados en TIC (CEATIC)
+#
+# Universidad de Jaén - 2018
+###############################################################################
+
+import json
+import os
+from ComplexityLanguage import ComplexityLanguage
+from ComplexitySpanish import ComplexitySpanish
+from ComplexityEnglish import ComplexityEnglish
+from ComplexityFrench import ComplexityFrench
+from ComplexityPolish import ComplexityPolish
+from ComplexityItalian import ComplexityItalian
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+from sklearn.decomposition import PCA
+from sklearn import preprocessing
+import argparse
+
+## ----------------------------------------------------------------------------
+##
+## Read command lines arguments
+##
+
+parser = argparse.ArgumentParser(description='PAN2018 author identificator based on text complexity metrics')
+parser.add_argument('-i', type=str, help='input directory')
+parser.add_argument('-o', type=str, help='output directory')
+args = parser.parse_args()
+INPUT_DIR, OUTPUT_DIR = args.i, args.o
+
+## ----------------------------------------------------------------------------
+##
+## Load of analyzers
+##
+
+print('Loading complexity analyzers for different languages...\n', flush=True)
+mlComplexityText = {
+    'en': ComplexityEnglish(),
+    'sp': ComplexitySpanish(),
+    'fr': ComplexityFrench(),
+    'pl': ComplexityPolish(),
+    'it': ComplexityItalian()
+}
+
+## ----------------------------------------------------------------------------
+##
+## Corpus loading (both, train and test data sets)
+##
+
+complexity_known = pd.DataFrame()
+complexity_unknown = pd.DataFrame()
+labels = {}
+labels_cand = []
+
+#
+# Recorremos todos los problemas
+#
+
+print('Loading collection-info.json file from', args.i, flush=True)
+with open(INPUT_DIR+'/collection-info.json', 'r') as f:      
+    collectionInfo = json.load(f)  
+    f.close()
+
+for problem in collectionInfo:
+    print('\n\nProblem: ', problem['problem-name'], flush=True)
+    print('Language: ', problem['language'], flush=True)
+    
+    #
+    # Cargamos la clase para el cálculo de la complejidad del idioma correspondiente
+    #
+    complexityText = mlComplexityText[problem['language']]
+    
+    #
+    # Recorremos todos los candidatos
+    #
+    print("Loading problem data...\n", flush=True)
+    with open(INPUT_DIR + '/' + problem['problem-name'] + '/problem-info.json', 'r') as problem_info_fhd:
+        problem_info= json.load(problem_info_fhd)
+        problem_info_fhd.close()
+    
+    #
+    # Leemos los textos de autoría conocida (TEXTOS DE ENTRENAMIENTO)
+    #
+    print("Loading training data")
+    for candidate in problem_info['candidate-authors']:
+        
+        print('Candidate: ', candidate['author-name'], flush=True)
+
+        files = os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], candidate['author-name']))
+        
+        probcand = problem['problem-name'] + candidate['author-name']
+        if not probcand in labels:
+            labels[probcand] = len(labels)
+            labels_cand += [probcand]
+        
+        #
+        # Procesamos todo los textos de ese candidato
+        #
+        for i, nameFile in enumerate(files):
+            print('Reading text file: ', nameFile, flush=True)
+
+            with open(os.path.join(os.path.join(INPUT_DIR,problem['problem-name'], candidate['author-name']), nameFile),'r') as context:
+                calcmetrics = complexityText.calcMetrics(context.read())
+                               
+                dfi = pd.DataFrame(calcmetrics, index=[i])
+                dfi['problem'] = problem['problem-name']
+                dfi['language'] = problem['language']
+                dfi['candidate'] = candidate['author-name']
+                dfi['label'] = labels[probcand]
+                dfi['filename'] = nameFile
+                complexity_known = complexity_known.append([dfi])   
+
+    #
+    # Si existe ground-truth, lo leemos para conocer los candidatos
+    #
+    unknown_candidates = False
+    if os.path.isfile(INPUT_DIR +'/'+ problem['problem-name'] + '/ground-truth.json'):
+        print("Reading ground truth...", flush=True)
+        with open(INPUT_DIR +'/'+ problem['problem-name'] + '/ground-truth.json', 'r') as ground_truth_fhd:
+            ground_truth = json.load(ground_truth_fhd)
+            ground_truth_fhd.close()
+            unknown_candidates = {}
+            for item in ground_truth['ground_truth']:
+                unknown_candidates[item['unknown-text']] = item['true-author']
+    
+    #
+    # Recorremos archivos sin etiquetar (TEXTOS DE TEST)
+    #
+    print("Loading test data", flush=True)
+    
+    for i, unknown_file in enumerate(os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], problem_info['unknown-folder']))):
+        print("Analyzing file", unknown_file, flush=True)
+        with open(INPUT_DIR + '/' + problem['problem-name'] + '/' + problem_info['unknown-folder'] + '/' + unknown_file, 'r') as unknown_fhd:
+            calcmetrics = complexityText.calcMetrics(unknown_fhd.read())
+            unknown_fhd.close()
+            
+            dfi = pd.DataFrame(calcmetrics, index=[i])
+            dfi['problem'] = problem['problem-name']
+            dfi['language'] = problem['language']
+            if unknown_candidates and unknown_candidates[unknown_file]:
+                probcand = problem['problem-name'] + unknown_candidates[unknown_file]                   
+                dfi['candidate'] = unknown_candidates[unknown_file]
+                dfi['label'] = labels[probcand]
+            
+            else:
+                dfi['candidate'] = None
+                dfi['label'] = None
+            dfi['filename'] = unknown_file
+            complexity_unknown = complexity_unknown.append([dfi])
+
+## ----------------------------------------------------------------------------
+##
+## Training and classification
+##
+
+if not os.path.exists(OUTPUT_DIR):
+    os.makedirs(OUTPUT_DIR)
+
+from sklearn import svm
+clf = svm.LinearSVC(C=1)
+for problem in set(complexity_known['problem']):
+
+    answers = []
+
+    print('------- Training and classifying ', problem, flush=True)
+    
+    #
+    # Para el train cogemos los textos conocidos
+    #
+    train = complexity_known.loc[complexity_known['problem'] == problem]
+    train = train.dropna(axis=1, how='any')
+    train_target = train['label']
+    train_data= train.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
+    train_data = pd.DataFrame(preprocessing.normalize(train_data, norm='l2'))
+
+    #
+    # Para el test cogemos los textos desconocidos
+    #
+    test = complexity_unknown.loc[complexity_unknown['problem'] == problem]
+    test = test.dropna(axis=1, how='any')
+    test_target = test['label']    
+    test_data = test.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
+    test_data = pd.DataFrame(preprocessing.normalize(test_data, norm='l2'))
+
+    #Entrenamos con los textos con candidatos conocidos y predecimos con los datos desconocidos
+    y_pred = clf.fit(train_data, train_target).predict(test_data)
+    
+    for index, row in test.iterrows():
+        probcand = labels_cand[y_pred[index]]
+        answers.append({
+            'unknown-text': row['filename'],
+            'predicted-author': probcand[probcand.find("candidate"):],
+        })
+    
+    with open(OUTPUT_DIR + '/answers-' + problem +'.json', 'w') as file:
+        json.dump(answers, file, indent=4)
+