classify_comp working on tira.io

parent 1df9d9e2
import sys
sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
from ComplexityLanguage import ComplexityLanguage
import re
import math
@@ -179,4 +181,4 @@ class ComplexityEnglish(ComplexityLanguage):
return metrics
\ No newline at end of file
import sys
sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
from ComplexityLanguage import ComplexityLanguage
import freeling
import os
@@ -15,9 +17,9 @@ class ComplexityFrench(ComplexityLanguage):
ComplexityLanguage.__init__(self, lang)
## Modify this line to be your FreeLing installation directory
FREELINGDIR = "/usr/local"
DATA = FREELINGDIR+"/share/freeling/"
CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
DATA = FREELINGDIR+"/data/"
CLASSDIR = ""
self.lang = lang
freeling.util_init_locale("default")
@@ -50,12 +52,8 @@ class ComplexityFrench(ComplexityLanguage):
self.tg=freeling.hmm_tagger(DATA+lang+"/tagger.dat",True,2)
self.sen=freeling.senses(DATA+lang+"/senses.dat")
# Dale-Chall word lists
CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
f = open(CLASSDIR + 'DaleChall.txt')
f = open(CLASSDIR + '/home/garciacumbreras18/DaleChall.txt')
lines = f.readlines()
f.close()
@@ -143,4 +141,4 @@ class ComplexityFrench(ComplexityLanguage):
return metrics
\ No newline at end of file
# -*- coding: utf-8 -*-
import sys
sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
import freeling
import os
import re
@@ -12,9 +14,9 @@ class ComplexityItalian():
def __init__(self, lang = 'it'):
## Modify this line to be your FreeLing installation directory
FREELINGDIR = "/usr/local"
DATA = FREELINGDIR+"/share/freeling/"
CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
DATA = FREELINGDIR+"/data/"
self.DATA = DATA
self.lang = lang
freeling.util_init_locale("default")
@@ -279,4 +281,4 @@ class ComplexityItalian():
\ No newline at end of file
# -*- coding: utf-8 -*-
import sys
sys.path.append('/home/garciacumbreras18/dist/freeling/APIs/python')
import freeling
import os
import re
@@ -8,14 +12,14 @@ import scipy.stats
import math
class ComplexityLanguage():
def __init__(self, lang):
## Modify this line to be your FreeLing installation directory
FREELINGDIR = "/usr/local"
DATA = FREELINGDIR+"/share/freeling/"
FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
DATA = FREELINGDIR+"/data/"
self.DATA = DATA
self.lang = lang
freeling.util_init_locale("default")
@@ -24,49 +28,49 @@ class ComplexityLanguage():
# create options set for maco analyzer. Default values are Ok, except for data files.
op= freeling.maco_options(lang)
op.set_data_files( "",
op.set_data_files( "",
self.DATA + "common/punct.dat",
self.DATA + self.lang + "/dicc.src",
self.DATA + self.lang + "/afixos.dat",
"",
self.DATA + self.lang + "/locucions.dat",
self.DATA + self.lang + "/locucions.dat",
self.DATA + self.lang + "/np.dat",
self.DATA + self.lang + "/quantities.dat",
self.DATA + self.lang + "/probabilitats.dat")
# create analyzers
self.tk=freeling.tokenizer(self.DATA+self.lang+"/tokenizer.dat")
#self.sp=freeling.splitter("/home/sinai/Freeling/data/"+self.lang+"/splitter.dat")
self.sp=freeling.splitter(self.DATA+self.lang+"/splitter.dat")
self.mf=freeling.maco(op)
# activate morpho modules to be used in next call
self.mf.set_active_options(False, True, True, True, # select which among created
True, True, False, True, # submodules are to be used.
True, True, True, True ) # default: all created submodules are used
# create tagger, sense annotator, and parsers
self.tg=freeling.hmm_tagger(self.DATA+self.lang+"/tagger.dat",True,2)
self.sen=freeling.senses(self.DATA+self.lang+"/senses.dat")
#self.parser= freeling.chart_parser(DATA+lang+"/chunker/grammar-chunk.dat")
#self.dep=freeling.dep_txala(DATA+lang+"/dep_txala/dependences.dat", self.parser.get_start_symbol())
"""
"""
config es una lista de valores booleanos que activa o desactivan el cálculo de una medida
config = [
True|False, # PUNCTUATION MARKS
True|False, # SCI
True|False, # ARI
True|False, # ARI
True|False, # MU
]
Si config == None se calculan todas las métricas de complejidad soportadas
"""
self.config = [True, True, True, True]
self.metricsStr = ['AVERAGE PUNCTUATION MARKS', 'SCI', 'ARI', 'MU']
pass
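# Usage sketch (illustrative; assumes the FreeLing data files configured
# above are installed):
#   cl = ComplexityLanguage('es')
#   cl.config = [True, False, True, False]  # compute only PUNCTUATION MARKS and ARI
#   metrics = cl.calcMetrics('Some input text.')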
def textProcessing(self, text):
text = text.replace(u'\xa0', u' ').replace('"', '')
# put everything into one pattern of valid tokens
@@ -86,9 +90,9 @@ class ComplexityLanguage():
#ls = self.dep.analyze(ls)
#print("After dependencies", len(ls))
self.sentences = ls
self.N_sentences = len(ls)
self.sp.close_session(sid)
#print('Las oraciones: ', self.sentences)
'''
Filter out those tokens that are not adjectives, verbs or nouns
@@ -97,11 +101,11 @@ class ComplexityLanguage():
for sentence in self.sentences:
ws = sentence.get_words();
pos_content_sentences.append([w for w in ws if re.match('N.*|V.*|A.*', w.get_tag())])
self.pos_content_sentences = pos_content_sentences
return self.pos_content_sentences, self.sentences, self.N_sentences
def punctuationMarks(self):
# We only want to count the tokens that are punctuation marks.
#Number of words.
@@ -114,10 +118,10 @@ class ComplexityLanguage():
else:
lwords.append(w.get_form())
self.N_words = len(lwords)
#print('Number of words (N_w): ', self.N_words, '\n' )
self.N_punctuation = len(punctuation)
self.punctuation = punctuation
@@ -125,14 +129,14 @@ class ComplexityLanguage():
punctuation_over_words = 0
else:
punctuation_over_words = self.N_punctuation / self.N_words
self.punctuation_over_words = punctuation_over_words
#print("PUNCTUATION MARKS = ", self.N_punctuation,'\n')
return self.punctuation_over_words, self.N_punctuation, self.punctuation, self.N_words
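# Worked example (illustrative numbers): 8 punctuation tokens over 100 words
# give punctuation_over_words = 8/100 = 0.08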
def sentenceComplexity(self):
#Number of complex sentences
N_cs = 0
for sentence in self.sentences:
@@ -149,25 +153,25 @@ class ComplexityLanguage():
else:
previous_is_verb = False
if count>0:
N_cs += 1
self.N_cs = N_cs
#print("Number of complex sentences: ", self.N_cs, "\n")
ASL = self.N_words / self.N_sentences
self.ASL = ASL
#print("Average Sentence Length (ASL) = ", self.ASL, '\n')
CS = self.N_cs / self.N_sentences
self.CS = CS
#print("Complex Sentences (CS) = ", self.CS, '\n')
SCI = (ASL + CS)/ 2
self.SCI = SCI
#print("SENTENCE COMPLEXITY INDEX:(SCI) = ", self.SCI, "\n")
return self.SCI, self.CS, self.N_cs, self.ASL
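# Worked example (illustrative numbers): 200 words over 10 sentences gives
# ASL = 20.0; if 4 of the 10 sentences are complex, CS = 0.4, so
# SCI = (20.0 + 0.4) / 2 = 10.2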
def autoReadability(self):
#Number of characters
count = 0
@@ -178,24 +182,24 @@ class ComplexityLanguage():
count +=1
else:
listwords.append(w.get_form())
self.listwords = listwords
N_charac = 0
for characters in self.listwords:
N_charac += len(characters)
self.N_charac = N_charac
#print("Number of characters: ", self.N_charac, "\n")
ARI = 4.71 * self.N_charac / self.N_words + 0.5 * self.N_words / self.N_sentences - 21.43
self.ARI = ARI
#print("AUTOMATED READABILITY INDEX (ARI) = ", self.ARI, '\n')
return self.ARI, self.N_charac, self.listwords
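# Worked example (illustrative numbers): 500 characters, 100 words and
# 10 sentences give ARI = 4.71*(500/100) + 0.5*(100/10) - 21.43
# = 23.55 + 5.0 - 21.43 = 7.12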
def mureadability(self):
#Number of syllables and Number of words with 3 or more syllables:tagger
N_syllables = 0
N_syllables3 = 0
@@ -207,10 +211,10 @@ class ComplexityLanguage():
count+=1
if count>=3:
N_syllables3 += 1
self.N_syllables = N_syllables
self.N_syllables3 = N_syllables3
#Number of letters
N_letters= 0
letters = []
@@ -220,33 +224,33 @@ class ComplexityLanguage():
letters.append(word)
N_letters+=len(word)
vecletters.append(len(word))
self.letters = letters
self.N_letters = N_letters
self.vecletters = vecletters
x=self.N_letters / self.N_words
varianza=np.var(self.vecletters)
mu = (self.N_words /(self.N_words-1))*(x/varianza)*100
#print("READABILITY MU: ", mu, "\n")
self.mu = mu
return self.mu, self.N_syllables, self.N_syllables3, self.letters, self.N_letters, self.vecletters
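# Worked example (illustrative numbers): 100 words with a mean word length
# of x = 4.5 letters and a variance of word lengths of 7.0 give
# mu = (100/99) * (4.5/7.0) * 100 ≈ 64.9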
def calcMetrics(self, text):
"""
Calcula la métricas de complejidad activadas en la configuración
"""
"""
Calcula la métricas de complejidad activadas en la configuración
"""
self.textProcessing(text)
metrics = {}
punctuationMarks = None
autoreadability = None
sentencecomplexity = None
for i in range(0, len(self.metricsStr)):
if (self.config == None or self.config[i]) and self.metricsStr[i] == 'AVERAGE PUNCTUATION MARKS':
punctuationmarks = self.punctuationMarks()
metrics['AVERAGE PUNCTUATION MARKS'] = punctuationmarks[0]
@@ -259,9 +263,9 @@ class ComplexityLanguage():
if (self.config == None or self.config[i]) and self.metricsStr[i] == 'MU':
mureadability = self.mureadability()
metrics['MU'] = mureadability[0]
return metrics
def getPOS(self, text):
self.textProcessing(text)
pos_sentences = []
@@ -270,6 +274,5 @@ class ComplexityLanguage():
pos_sentences.append([w.get_tag() for w in ws])
#print('POS',pos_sentences)
self.pos_sentences = pos_sentences
return self.pos_sentences
\ No newline at end of file
@@ -17,7 +17,7 @@ class ComplexitySpanish(ComplexityLanguage):
self.dep=freeling.dep_txala(self.DATA+self.lang+"/dep_txala/dependences.dat", self.parser.get_start_symbol())
# To read the input text
CLASSDIR = "/home/sinai/Experiments/CLEF-PAN/"
CLASSDIR = "/home/garciacumbreras18/"
f = open(CLASSDIR + 'CREA_total.txt')
lines = f.readlines()
@@ -300,4 +300,4 @@ class ComplexitySpanish(ComplexityLanguage):
metrics['CRAWFORD'] = self.yearsCrawford()
return metrics
\ No newline at end of file
#!/usr/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
# Universidad de Jaén
# 2018
import json
import os
from ComplexityLanguage import ComplexityLanguage
from ComplexitySpanish import ComplexitySpanish
from ComplexityEnglish import ComplexityEnglish
from ComplexityFrench import ComplexityFrench
from ComplexityPolish import ComplexityPolish
from ComplexityItalian import ComplexityItalian
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn import preprocessing
import argparse
parser = argparse.ArgumentParser(description='PAN2018 author identifier based on text complexity metrics')
parser.add_argument('-i', type=str, help='input directory')
parser.add_argument('-o', type=str, help='output directory')
args = parser.parse_args()
print(args.i, args.o)
exit()
print('Loading complexity analyzers for different languages...\n', flush=True)
mlComplexityText = {
'en': ComplexityEnglish(),
'sp': ComplexitySpanish(),
'fr': ComplexityFrench(),
'pl': ComplexityPolish(),
'it': ComplexityItalian()
}
INPUT_DIR = args.i
OUTPUT_DIR = args.o
with open(INPUT_DIR+'/collection-info.json', 'r') as f:
collectionInfo = json.load(f)
f.close()
print(type(collectionInfo))
\ No newline at end of file
#!/home/garciacumbreras18/anaconda3/bin/python3
#/usr/bin/env python
# -*- coding: utf-8 -*-
###############################################################################
# Authors:
# Rocío López-Anguita (rlanguit@ujaen.es)
# Arturo Montejo-Ráez (amontejo@ujaen.es)
# Centro de Estudios Avanzados en TIC (CEATIC)
#
# Universidad de Jaén - 2018
###############################################################################
import json
import os
from ComplexityLanguage import ComplexityLanguage
from ComplexitySpanish import ComplexitySpanish
from ComplexityEnglish import ComplexityEnglish
from ComplexityFrench import ComplexityFrench
from ComplexityPolish import ComplexityPolish
from ComplexityItalian import ComplexityItalian
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn import preprocessing
import argparse
## ----------------------------------------------------------------------------
##
## Read command lines arguments
##
parser = argparse.ArgumentParser(description='PAN2018 author identifier based on text complexity metrics')
parser.add_argument('-i', type=str, help='input directory')
parser.add_argument('-o', type=str, help='output directory')
args = parser.parse_args()
INPUT_DIR, OUTPUT_DIR = args.i, args.o
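# Example invocation (illustrative paths):
#   ./classify_comp.py -i /path/to/pan18-input -o /path/to/output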
## ----------------------------------------------------------------------------
##
## Load of analyzers
##
print('Loading complexity analyzers for different languages...\n', flush=True)
mlComplexityText = {
'en': ComplexityEnglish(),
'sp': ComplexitySpanish(),
'fr': ComplexityFrench(),
'pl': ComplexityPolish(),
'it': ComplexityItalian()
}
## ----------------------------------------------------------------------------
##
## Corpus loading (both, train and test data sets)
##
complexity_known = pd.DataFrame()
complexity_unknown = pd.DataFrame()
labels = {}
labels_cand = []
#
# Iterate over all the problems
#
print('Loading collection-info.json file from', args.i, flush=True)
with open(INPUT_DIR+'/collection-info.json', 'r') as f:
collectionInfo = json.load(f)
f.close()
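# Sketch of the expected collection-info.json layout, inferred from the
# fields read below (values are illustrative):
#   [{"problem-name": "problem00001", "language": "en"}, ...]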
for problem in collectionInfo:
print('\n\nProblem: ', problem['problem-name'], flush=True)
print('Language: ', problem['language'], flush=True)
#
# Load the class that computes complexity for the corresponding language
#
complexityText = mlComplexityText[problem['language']]
#
# Iterate over all the candidates
#
print("Loading problem data...\n", flush=True)
with open(INPUT_DIR + '/' + problem['problem-name'] + '/problem-info.json', 'r') as problem_info_fhd:
problem_info= json.load(problem_info_fhd)
problem_info_fhd.close()
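# Sketch of the expected problem-info.json layout, inferred from the fields
# read below (values are illustrative):
#   {"unknown-folder": "unknown",
#    "candidate-authors": [{"author-name": "candidate00001"}, ...]}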
#
# Read the texts of known authorship (TRAINING TEXTS)
#
print("Loading training data")
for candidate in problem_info['candidate-authors']:
print('Candidate: ', candidate['author-name'], flush=True)
files = os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], candidate['author-name']))
probcand = problem['problem-name'] + candidate['author-name']
if not probcand in labels:
labels[probcand] = len(labels)
labels_cand += [probcand]
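# labels maps each problem+candidate string to a numeric class id;
# labels_cand is the inverse lookup (class id -> problem+candidate string)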
#
# Process all of this candidate's texts
#
for i, nameFile in enumerate(files):
print('Reading text file: ', nameFile, flush=True)
with open(os.path.join(os.path.join(INPUT_DIR,problem['problem-name'], candidate['author-name']), nameFile),'r') as context:
calcmetrics = complexityText.calcMetrics(context.read())
dfi = pd.DataFrame(calcmetrics, index=[i])
dfi['problem'] = problem['problem-name']
dfi['language'] = problem['language']
dfi['candidate'] = candidate['author-name']
dfi['label'] = labels[probcand]
dfi['filename'] = nameFile
complexity_known = complexity_known.append([dfi])
#
# If a ground truth exists, read it to learn the candidates
#
unknown_candidates = False
if os.path.isfile(INPUT_DIR +'/'+ problem['problem-name'] + '/ground-truth.json'):
print("Reading ground truth...", flush=True)
with open(INPUT_DIR +'/'+ problem['problem-name'] + '/ground-truth.json', 'r') as ground_truth_fhd:
ground_truth = json.load(ground_truth_fhd)
ground_truth_fhd.close()
unknown_candidates = {}
for item in ground_truth['ground_truth']:
unknown_candidates[item['unknown-text']] = item['true-author']
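# Sketch of the expected ground-truth.json layout, inferred from the fields
# read above (values are illustrative):
#   {"ground_truth": [{"unknown-text": "unknown00001.txt",
#                      "true-author": "candidate00001"}, ...]}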
#
# Iterate over the unlabeled files (TEST TEXTS)
#
print("Loading test data", flush=True)
for i, unknown_file in enumerate(os.listdir(os.path.join(INPUT_DIR, problem['problem-name'], problem_info['unknown-folder']))):
print("Analyzing file", unknown_file, flush=True)
with open(INPUT_DIR + '/' + problem['problem-name'] + '/' + problem_info['unknown-folder'] + '/' + unknown_file, 'r') as unknown_fhd:
calcmetrics = complexityText.calcMetrics(unknown_fhd.read())
unknown_fhd.close()
dfi = pd.DataFrame(calcmetrics, index=[i])
dfi['problem'] = problem['problem-name']
dfi['language'] = problem['language']
if unknown_candidates and unknown_candidates.get(unknown_file):
probcand = problem['problem-name'] + unknown_candidates[unknown_file]
dfi['candidate'] = unknown_candidates[unknown_file]
dfi['label'] = labels[probcand]
else:
dfi['candidate'] = None
dfi['label'] = None
dfi['filename'] = unknown_file
complexity_unknown = complexity_unknown.append([dfi])
## ----------------------------------------------------------------------------
##
## Training and classification
##
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
from sklearn import svm
clf = svm.LinearSVC(C=1)
for problem in set(complexity_known['problem']):
answers = []
print('------- Training and classifying ', problem, flush=True)
#
# For training we take the known texts
#
train = complexity_known.loc[complexity_known['problem'] == problem]
train = train.dropna(axis=1, how='any')
train_target = train['label']
train_data= train.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
train_data = pd.DataFrame(preprocessing.normalize(train_data, norm='l2'))
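# normalize(..., norm='l2') rescales each row (one text) to unit length, so
# features with large absolute values do not dominate the linear SVM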
#
# For testing we take the unknown texts
#
test = complexity_unknown.loc[complexity_unknown['problem'] == problem]
test = test.dropna(axis=1, how='any')
test_target = test['label']
test_data = test.drop(['problem', 'language', 'candidate', 'filename', 'label'], axis=1)
test_data = pd.DataFrame(preprocessing.normalize(test_data, norm='l2'))
# Train on the texts with known candidates and predict on the unknown data
y_pred = clf.fit(train_data, train_target).predict(test_data)
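# y_pred holds numeric class ids; map each back to its problem+candidate
# string, then strip the problem prefix to recover the candidate name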
for index, row in test.iterrows():
probcand = labels_cand[y_pred[index]]
answers.append({
'unknown-text': row['filename'],
'predicted-author': probcand[probcand.find("candidate"):],
})
with open(OUTPUT_DIR + '/answers-' + problem +'.json', 'w') as file:
json.dump(answers, file, indent=4)
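# Each answers-<problem>.json file is written as a list of dicts like
# (values are illustrative):
#   [{"unknown-text": "unknown00001.txt", "predicted-author": "candidate00003"}, ...]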