Second approach to the RNN program

parent dbb12602
@@ -4,6 +4,8 @@ Created on 20 jun. 2018
@author: fmplaza
'''
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
from model.glove_word_embedings import GloveWordEmbednigs
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
@@ -15,88 +17,163 @@ from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.models import Sequential
from keras.utils import np_utils
from sklearn import metrics
from keras.preprocessing import sequence
import numpy as np
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
CLASSES = []
EMBEDDING_DIM = 200
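#EMBEDDING_DIM = 200 matches the dimensionality of the GloVe Twitter vectors loaded below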
#load GloVe model
glove = GloveWordEmbednigs()
glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file
#Load the Glove vectors file into memory, 2 index reserved
glove.load(2)
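#NOTE: the two reserved indexes are assumed to cover padding and out-of-vocabulary tokens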
#Load the WASSA corpus
def read_corpus():
    tweets_train_labels_numeric = []
    tweets_dev_labels_numeric = []

    tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0)
    tweets_train_labels = tweets_train['emotion']

    tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0)
    tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0)
    tweets_dev_labels = tweets_dev_labels['emotion']

    #convert categorical labels into numeric labels
    for label in tweets_train_labels.tolist():
        if label not in CLASSES:
            CLASSES.append(label)
        tweets_train_labels_numeric.append(CLASSES.index(label))

    for label in tweets_dev_labels.tolist():
        tweets_dev_labels_numeric.append(CLASSES.index(label))

    #one-hot encode the training labels for the softmax output layer
    tweets_train_labels_numeric = np_utils.to_categorical(tweets_train_labels_numeric)

    return tweets_train.tweet, tweets_train_labels_numeric, tweets_dev.tweet, tweets_dev_labels_numeric
def tokenize(text):
    #preprocess the text with the Twitter-aware tokenizer
    return TWEET_TOKENIZER.tokenize(text)
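#e.g. tokenize("@user I loooooove this") -> ['@user', 'i', 'looove', 'this']
#(preserve_case=False lower-cases the text, reduce_len=True caps character runs at three)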
def fit_transform_vocabulary(corpus):
    #generate the vocabulary of the corpus and map each document to word indexes
    #index 0 is left free so it can act as the padding/unknown-word index
    vocabulary = {}
    corpus_indexes = []
    index = 1

    for doc in corpus:
        doc_indexes = []
        tokens = tokenize(doc)
        for token in tokens:
            if token not in vocabulary:
                vocabulary[token] = index
                doc_indexes.append(index)
                index += 1
            else:
                doc_indexes.append(vocabulary[token])
        corpus_indexes.append(doc_indexes)

    return (vocabulary, corpus_indexes)
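#e.g. fit_transform_vocabulary(["good day", "good luck"])
#returns ({'good': 1, 'day': 2, 'luck': 3}, [[1, 2], [1, 3]])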
def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev):
    #Classification with RNN and embeddings (pre-trained)

    #calculate the vocabulary and index the training corpus
    vocabulary_train, corpus_train_index = fit_transform_vocabulary(tweets_train)
    vocabulary_size = len(vocabulary_train) + 1
    #use the average training tweet length as the input sequence length
    max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index]))

    #index the dev corpus with the training vocabulary
    corpus_dev_index = []
    for tweet_dev in tweets_dev:
        tokens_dev = tokenize(tweet_dev)
        corpus_dev_index.append([vocabulary_train.get(token_dev, 0) for token_dev in tokens_dev])
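    #dev tokens never seen in training fall back to index 0,
    #whose row in the embedding matrix below stays all zeros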
    #load the pre-trained word embeddings into an Embedding layer
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, index in vocabulary_train.items():
        embedding_vector = glove.word_indexes.get(word.lower())
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    #pad (and truncate) the documents to max_len_input words
    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post")
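    #e.g. sequence.pad_sequences([[5, 3]], maxlen=4, padding="post") -> [[5, 3, 0, 0]]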
    #define the RNN model
    model = Sequential()
    #input_length: length of the input sequences, when it is constant
    e = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
    model.add(e)
    #the sequence of 200-dim word vectors is encoded into a single 32-dim vector
    model.add(LSTM(32))
    model.add(Dense(len(CLASSES), activation='softmax'))
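    #shape flow: (batch, max_len_input) -> Embedding -> (batch, max_len_input, 200)
    #            -> LSTM -> (batch, 32) -> Dense -> (batch, len(CLASSES))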
    #compile the model; categorical_crossentropy matches the one-hot labels and the softmax output
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    #summarize the model
    print(model.summary())
    #fit the model
    model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=10, verbose=0)
    #evaluate the model (on the training data itself)
    loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, verbose=0)
    print('Accuracy: %f' % (accuracy*100))
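    #predict_classes (Sequential API of Keras 2) returns the argmax class index per tweet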
    padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post")
    tweets_dev_classified_labels = model.predict_classes(padded_docs_dev, batch_size=32, verbose=1)

    return tweets_dev_classified_labels
def calculate_quality_performance(y_labels, y_classified_labels, model_name):
    classes_index = [CLASSES.index(c) for c in CLASSES]
    accuracy = metrics.accuracy_score(y_labels, y_classified_labels)
    macro_precision = metrics.precision_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    macro_recall = metrics.recall_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    macro_f1 = metrics.f1_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    print("\n*** Results " + model_name + " ***")
    print("Macro-Precision: " + str(macro_precision))
    print("Macro-Recall: " + str(macro_recall))
    print("Macro-F1: " + str(macro_f1))
    print("Accuracy: " + str(accuracy))
def main():
    tweets_train, tweets_train_labels_numeric, tweets_dev, tweets_dev_labels = read_corpus()
    tweets_dev_classified_labels = classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev)
    calculate_quality_performance(tweets_dev_labels, tweets_dev_classified_labels, "RNN_LSTM")

if __name__ == '__main__':
    main()
'''
Created on 20 jun. 2018
@author: fmplaza
'''
import pandas as pd
from nltk.tokenize import TweetTokenizer
import statistics as s
tknzr = TweetTokenizer()
def read_corpus():
    tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0)
    tweets_train_labels = tweets_train['emotion']
    tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0)
    tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0)
    return tweets_train, tweets_train_labels, tweets_dev, tweets_dev_labels
def calculate_position(tweets_train):
    #locate the #TRIGGERWORD placeholder in the training tweets
    position_triggerword = []
    len_tweet = []
    cont = 0

    for index, row in tweets_train.iterrows():
        text = tknzr.tokenize(row['tweet'])
        if '#TRIGGERWORD' in text:
            position = text.index('#TRIGGERWORD')
            position_triggerword.append(position)
        len_tweet.append(len(text))
        if len(text) == 155:
            cont = cont + 1

    print("Position trigger word")
    print("Max position: ", max(position_triggerword))
    print("Mean position: ", s.mean(position_triggerword))
    print("Mode position: ", s.mode(position_triggerword))
    print("Tweet length")
    print("Mean length: ", s.mean(len_tweet))
    print("Mode length: ", s.mode(len_tweet))
    print("Tweets of exactly 155 tokens: ", cont)
def main():
    tweets_train, tweets_train_labels, tweets_dev, tweets_dev_labels = read_corpus()
    calculate_position(tweets_train)

if __name__ == '__main__':
    main()