Third version of the script

parent 82e1a3eb
Showing with 70 additions and 58 deletions
@@ -5,22 +5,22 @@ Created on 20 jun. 2018
'''
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
from keras.datasets.imdb import get_word_index
from model.glove_word_embedings import GloveWordEmbednigs
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dense, LSTM, Embedding, Bidirectional, Conv1D, GlobalAveragePooling1D, MaxPooling1D, Dropout, Activation, Flatten
from mpl_toolkits.axes_grid1.axes_size import Padded
from keras.utils import np_utils
from sklearn import metrics
from nltk.tokenize.casual import TweetTokenizer
from keras.preprocessing import sequence
import random
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
@@ -31,8 +31,11 @@ EMBEDDING_DIM = 200
glove = GloveWordEmbednigs()
glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file
#Load the Glove vectors file into memory, 2 index reserved
glove.load(2)
#Load the Glove vectors file into memory, 3 indexes reserved (0: padding, 1: word not present in the embedding, 2: magic word)
number_features = 100000
begin_ofset = 3
glove.load(number_features, begin_ofset)
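The internals of GloveWordEmbednigs are not shown in this commit; as a hedged sketch, a loader like this presumably reads the plain-text GloVe file ("word v1 ... v200" per line) and assigns each word a row index starting at begin_ofset, so rows 0-2 stay free for the special tokens. load_glove and all names below are illustrative, not the repository's API:

import numpy as np

def load_glove(path, max_words, begin_offset):
    #map word -> matrix row; rows 0..begin_offset-1 stay reserved for special tokens
    word_indexes = {}
    index_vectors = {}
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= max_words:
                break
            parts = line.rstrip().split(" ")
            word_indexes[parts[0]] = i + begin_offset
            index_vectors[i + begin_offset] = np.asarray(parts[1:], dtype="float32")
    #total rows needed downstream: max_words + begin_offset (here 100000 + 3)
    return word_indexes, index_vectors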
#Load the WASSA corpus
@@ -73,82 +76,89 @@ def tokenize(text):
def fit_transform_vocabulary(corpus):
#generate vocabulary of corpus
vocabulary = {}
#index 0: padding
#index 1: word not present in the embedding
#index 2: magic word (triggerword)
#corpus_indexes: index of each word of tweet in the embedding model
corpus_indexes = []
index = 1
for doc in corpus:
doc_indexes = []
tweet_indexes = []
tokens = tokenize(doc)
for token in tokens:
if token not in vocabulary:
vocabulary[token] = index
doc_indexes.append(index)
index += 1
if(token != "#triggerword"):
if(glove.is_word(token)):
word_index_embedding = glove.get_word_index(token)
tweet_indexes.append(word_index_embedding)
else:
index = 1
tweet_indexes.append(index)
else:
doc_indexes.append(vocabulary[token])
index = 2
tweet_indexes.append(index)
corpus_indexes.append(doc_indexes)
return (vocabulary, corpus_indexes)
corpus_indexes.append(tweet_indexes)
return corpus_indexes
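As a quick illustration of the new mapping (the tokens here are hypothetical; the exact row of an in-vocabulary word depends on its position in the GloVe file):

corpus_indexes = fit_transform_vocabulary(["happy xyzzy42 #triggerword"])
#e.g. [[4821, 1, 2]]: an embedding row (>= 3) for "happy", 1 for the
#out-of-vocabulary token "xyzzy42", and 2 for the trigger word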
def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev):
#Classification with RNN and pre-trained embeddings
#calculate vocabulary
vocabulary_train, corpus_train_index = fit_transform_vocabulary(tweets_train)
vocabulary_size = len(vocabulary_train) + 1
max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index], 0))
#calculate index vocabulary in corpus dev
corpus_dev_index = []
own_corpus_dev_index_append = corpus_dev_index.append
for tweet_dev in tweets_dev:
tokens_dev = tokenize(tweet_dev)
own_corpus_dev_index_append([vocabulary_train.get(token_dev, 0) for token_dev in tokens_dev])
print(type(own_corpus_dev_index_append))
# load pre-trained word embeddings into an Embedding layer
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, index in vocabulary_train.items():
word = word.lower()
if index > vocabulary_size - 1:
break
else:
embedding_vector = glove.word_indexes.get(word)
if embedding_vector is not None:
embedding_matrix[index] = embedding_vector
#max_len_input = 30
train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_train_index[0][0]))
corpus_train_index = fit_transform_vocabulary(tweets_train)
corpus_dev_index = fit_transform_vocabulary(tweets_dev)
max_len_input = 40
train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", value = 0)
padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", value = 0)
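With padding="post" and truncating="post", pad_sequences fills or cuts at the end of each sequence, which is why index 0 is reserved as a dedicated padding row in the embedding matrix; a minimal check:

print(sequence.pad_sequences([[5, 9, 4]], maxlen=5, padding="post", truncating="post", value=0))
#[[5 9 4 0 0]]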
# define RNN model
model = Sequential()
#assign random vectors to the reserved special indexes (1: word not present, 2: trigger word)
vector_word_not_present = 0.1 * (2 * np.random.rand(EMBEDDING_DIM) - 1)
glove.set_embedding_vector(1, vector_word_not_present)
trigger_word_vector = 0.1 * (2 * np.random.rand(EMBEDDING_DIM) - 1)
glove.set_embedding_vector(2, trigger_word_vector)
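The expression 0.1 * (2 * np.random.rand(EMBEDDING_DIM) - 1) draws every component uniformly from [-0.1, 0.1], a small scale so the random special vectors do not dominate the pre-trained GloVe vectors:

v = 0.1 * (2 * np.random.rand(5) - 1)
print(v.min() >= -0.1 and v.max() <= 0.1)  #True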
#number of features in embeddings model
feature_size = number_features + 3
embedding_matrix = np.zeros((feature_size, EMBEDDING_DIM))
for word, idx in glove.word_indexes.items():
embedding_vec = glove.get_word_embedding(word)
if embedding_vec is not None and embedding_vec.shape[0]==EMBEDDING_DIM:
embedding_matrix[idx] = np.asarray(embedding_vec)
#input_length: Length of input sequences, when it is constant
e = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
e = Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
model.add(e)
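With trainable=False the GloVe weights stay frozen during training and the layer acts as a pure row lookup; a small standalone check of that behaviour, assuming the feature_size and embedding_matrix defined above:

lookup = Sequential()
lookup.add(Embedding(feature_size, EMBEDDING_DIM, input_length=2, weights=[embedding_matrix], trainable=False))
print(lookup.predict(np.array([[0, 2]])).shape)  #(1, 2, 200); index 0 maps to the all-zero padding row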
#number of features: 32; each 200-dim vector is converted to a 32-dim vector
model.add(LSTM(32))
#model.add(Dense(32, activation='tanh'))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
#model.add(Bidirectional(LSTM(2,dropout=0.2,recurrent_dropout=0.2,return_sequences=True)))
model.add(Dense(32, activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(len(CLASSES), activation='softmax'))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
print("compilando modelo")
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
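categorical_crossentropy expects one-hot targets, which is presumably what np_utils was imported for; if the labels arrive as integers (label_ints is a hypothetical name, not defined in this commit), the expected encoding would be:

#hypothetical: convert integer class labels to one-hot vectors for categorical_crossentropy
tweets_train_labels_numeric = np_utils.to_categorical(label_ints, num_classes=len(CLASSES))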
print("entrenando")
model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=10, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, verbose=0)
print('Accuracy: %f' % (accuracy*100))
padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_dev_index[0][0]))
#prediction
tweets_dev_classified_labels = model.predict_classes(padded_docs_dev, batch_size=32, verbose=1)
return tweets_dev_classified_labels
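Since sklearn's metrics module is imported at the top, a natural follow-up outside the function is to score the dev predictions against gold labels (tweets_dev_labels_numeric is a hypothetical name for the integer gold labels, not defined in this commit):

predicted_labels = classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev)
print("Dev accuracy: %f" % metrics.accuracy_score(tweets_dev_labels_numeric, predicted_labels))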