First approximation of the RNN program

'''
Created on 20 June 2018
@author: fmplaza
'''
from model.glove_word_embedings import GloveWordEmbednigs
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
CLASSES = []
EMBEDDING_DIM = 200
glove = GloveWordEmbednigs()
glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file
#Load the WASSA corpus
def read_corpus():

    tweets_train_labels_numeric = []
    classes_append = CLASSES.append

    tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0)
    tweets_train_labels = tweets_train['emotion']
    tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0)
    tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0)

    #map each emotion label to a numeric class index
    for label in tweets_train_labels.tolist():
        if(label not in CLASSES):
            classes_append(label)
        tweets_train_labels_numeric.append(CLASSES.index(label))

    return tweets_train.tweet, tweets_train_labels_numeric, tweets_dev.tweet, tweets_dev_labels
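
#Illustrative sketch of the corpus layout assumed by read_corpus() above. This is
#an assumption inferred from the columns the code accesses, not the official WASSA
#release notes: a tab-separated file with a header row that includes at least the
#'tweet' and 'emotion' columns, for example:
#
#    tweet<TAB>emotion
#    Thinking about the final whistle still hurts<TAB>sadness
#    what a great day with the team!<TAB>joy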
#Classification with RNN and embeddings (pre-trained)
def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric):

    #load the whole embedding file into memory, 2 indexes reserved
    glove.load(2)

    #preprocessing tweets train
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tweets_train)

    #calculate vocabulary size (index 0 is reserved by Keras)
    vocabulary_size = len(tokenizer.word_index) + 1

    # load pre-trained word embeddings into an embedding matrix, one row per word
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, index in tokenizer.word_index.items():
        if index > vocabulary_size - 1:
            break
        else:
            embedding_vector = glove.get_word_embedding(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector

    max_len_input = 20

    #integer encode the documents
    encoded_docs = tokenizer.texts_to_sequences(tweets_train)

    #pad documents to a max length of n words
    padded_docs = pad_sequences(encoded_docs, maxlen=max_len_input, padding='post')

    # define RNN model: the pre-trained matrix is passed as frozen weights
    model = Sequential()
    e = Embedding(vocabulary_size, EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_len_input, trainable=False)
    model.add(e)
    model.add(LSTM(32))
    #model.add(Dense(32, activation='tanh'))
    # one output unit per emotion class (softmax over a single unit would always output 1)
    model.add(Dense(len(CLASSES), activation='softmax'))

    # compile the model with a multi-class loss that accepts integer labels
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

    # summarize the model
    print(model.summary())

    # fit the model
    print(len(padded_docs))
    print(len(tweets_train_labels_numeric))
    model.fit(padded_docs, np.array(tweets_train_labels_numeric), epochs=50, verbose=0)

    # evaluate the model on the training data
    loss, accuracy = model.evaluate(padded_docs, np.array(tweets_train_labels_numeric), verbose=0)
    print('Accuracy: %f' % (accuracy * 100))
def main():
    tweets_train, tweets_train_labels_numeric, tweets_dev, tweets_dev_labels = read_corpus()
    classification_embedings_rnn(tweets_train, tweets_train_labels_numeric)

if __name__ == '__main__':
    main()
#!/usr/bin/python
# *-* coding: utf-8 *-*
'''
Created on 2 June 2016
@author: Eugenio Martínez Cámara
'''
from abc import ABCMeta, abstractmethod
class AbstractWordEmbedding(metaclass=ABCMeta):
    '''Abstract class that defines the attributes and methods for word embedding
    classes.

    This class represents an embeddings file, so it must maintain a matrix
    (list of lists) with the embedding vectors and an index of words with their
    corresponding row in the matrix. It is very important to keep the first row
    of the matrix, index 0, as an empty list. Index 0 is reserved for the
    default vector assigned to unseen words, which is application dependent.

    Attributes:
        __word_indexes: A dictionary whose keys are the words of the embeddings file
        and whose values are the indexes of the words in the matrix of embeddings.
        The index goes from 1 to the size of the embeddings file. The index
        zero is a reserved value.
        __word_embeddings: A matrix whose rows are the embeddings linked to
        each word.
    '''

    @property
    @abstractmethod
    def word_indexes(self):
        """Getter method for the attribute word_indexes

        Returns:
            A dictionary whose keys are the words of the embeddings file and
            whose values are the indexes of the words in the matrix of embeddings.
        """
        ...

    @property
    @abstractmethod
    def word_embeddings(self):
        """Getter method for the attribute word_embeddings

        Returns:
            A matrix (list of lists) whose rows are vectors of floats, one per
            word.
        """
        ...

    @property
    @abstractmethod
    def path_file(self):
        """Getter method for the attribute path_file

        Returns:
            path_file: A string with the path of the file.
        """
        ...

    @path_file.setter
    @abstractmethod
    def path_file(self, a_path_file):
        """Setter method for the attribute path_file

        Args:
            a_path_file: A string with the path of the file.
        """
        ...

    @property
    @abstractmethod
    def encoding(self):
        """Getter method for the attribute encoding

        Returns:
            The encoding of the word embeddings file.
        """
        ...

    @encoding.setter
    @abstractmethod
    def encoding(self, a_encoding):
        """Setter method for the attribute encoding

        Args:
            a_encoding: The encoding of the word embeddings file.
        """
        ...

    @abstractmethod
    def is_word(self, word):
        """Method to check whether the word is in the embeddings file.

        Returns:
            True: If the word is in the embeddings
            False: If the word isn't in the embeddings
        """
        ...

    @abstractmethod
    def get_word_embedding(self, word):
        """Returns the embedding of the word

        Args:
            word: A string that is the word whose embedding vector will be
            retrieved.

        Returns:
            If the word is in the embeddings, the vector of the word; otherwise None
        """
        ...

    @abstractmethod
    def clean(self):
        """Clean the memory of the embeddings.
        """
        ...

    @abstractmethod
    def load(self, begin_ofset=None, vocabulary=None):
        """Load the word embeddings file into memory

        Args:
            begin_ofset: The number of initial positions that must be assigned
            a null value, so that the specific application can use them for its
            own purposes.
            vocabulary: Optional argument. If None, the whole word embeddings
            file is loaded; otherwise, only the words in the set vocabulary
            are loaded.
        """
        ...

    @abstractmethod
    def set_embedding_vector(self, index, vector):
        """Set an embedding vector

        Args:
            index: The index of the vector to add
            vector: A list of values.
        """
        ...

    @abstractmethod
    def size_embedding_vector(self):
        """It returns the size of an embedding vector

        Returns:
            An integer value which is the size of the embedding vectors.
        """
        ...

    @abstractmethod
    def number_of_embedding_vectors(self):
        """It returns the number of embedding vectors

        Returns:
            An integer with the number of embedding vectors.
        """
        ...
#!/usr/bin/python3
# *-* coding: utf-8 *-*
'''
Created on 11 May 2016
@author: Eugenio Martínez Cámara
'''
import logging
from model.abstract_word_embedding import AbstractWordEmbedding
from numpy import fromstring
class GloveWordEmbednigs(AbstractWordEmbedding):
    '''Class for the management of GloVe vectors

    Attributes:
        __word_indexes: A dictionary whose keys are the words of the embeddings file
        and whose values are the indexes of the words in the matrix of embeddings.
        The index goes from 1 to the size of the embeddings file. The index
        zero is a reserved value.
        __word_embeddings: A matrix whose rows are the embeddings linked to
        each word.

    Constants:
        GLOVE_SEPARATOR: The separator char in the GloVe embeddings file
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.__word_indexes = {}
        self.__word_embeddings = []
        self.__path_file = ""
        self.__GLOVE_SEPARATOR = " "
        self.__encoding = "us-ascii"

    @property
    def word_embeddings(self):
        """Getter method for the attribute word_embeddings

        Returns:
            A matrix (list of lists) with the embedding vectors. Each row is
            linked to a word of the attribute word_indexes.
        """
        return(self.__word_embeddings)

    @property
    def word_indexes(self):
        """Getter method for the attribute word_indexes

        Returns:
            A dictionary whose keys are the words of the embeddings file and
            whose values are the indexes of the words in the matrix of embeddings.
        """
        return(self.__word_indexes)

    @property
    def path_file(self):
        """Getter method for the attribute path_file

        Returns:
            path_file: A string with the path of the file.
        """
        return(self.__path_file)

    @path_file.setter
    def path_file(self, a_path_file):
        """Setter method for the attribute path_file

        Args:
            a_path_file: A string with the path of the file.
        """
        self.__path_file = a_path_file

    @property
    def encoding(self):
        """Getter method for the attribute encoding
        """
        return(self.__encoding)

    @encoding.setter
    def encoding(self, a_encoding):
        """Setter method for the attribute encoding

        Args:
            a_encoding: A string value with the encoding of the word embeddings file.
        """
        self.__encoding = a_encoding

    def is_word(self, word):
        """Method to check whether the word is in the embeddings file.

        Args:
            word: A string that is the word to look up in the embeddings.

        Returns:
            True: If the word is in the embeddings
            False: If the word isn't in the embeddings
        """
        return( word in self.__word_indexes )

    def get_word_index(self, word):
        """Returns the index of the word

        Args:
            word: A string that is the word whose index is going to be
            retrieved.

        Returns:
            If the word is in the embeddings, the index of the word. Otherwise,
            None is returned.
        """
        return(self.__word_indexes.get(word, None))

    def get_word_embedding(self, word):
        """Returns the embedding of the word

        Args:
            word: A string that is the word whose embedding vector will be
            retrieved.

        Returns:
            If the word is in the embeddings, the vector of the word; otherwise None
        """
        return_value = None
        word_index = self.__word_indexes.get(word, None)
        if(word_index is not None):
            return_value = self.__word_embeddings[word_index]
        return(return_value)

    def clean(self):
        """Clean the memory of the embeddings.
        """
        self.__word_embeddings.clear()

    def __load_full(self, index):
        """Load the full file of word embeddings

        Args:
            index: The first free index of the word embeddings main vector
        """
        own_partition = str.partition
        own_strip = str.strip
        with open(self.__path_file, "r", encoding="utf-8") as glove_file:
            for line in glove_file:
                #each line is "word v1 v2 ... vn"; split off the word and parse the vector
                glove_fields = own_partition(line, self.__GLOVE_SEPARATOR)
                glove_vector = fromstring(own_strip(glove_fields[2]), dtype=float, sep=" ")
                self.__word_indexes[own_strip(glove_fields[0])] = index
                self.__word_embeddings.append(glove_vector)
                index += 1

    def __load_only_vocabulary(self, index, vocabulary):
        """Load only the words of the vocabulary

        Args:
            index: The first free index of the word embeddings main vector
            vocabulary: the set of words to be loaded.
        """
        own_partition = str.partition
        own_strip = str.strip
        with open(self.__path_file, "r", encoding="utf-8") as glove_file:
            for line in glove_file:
                glove_fields = own_partition(line, self.__GLOVE_SEPARATOR)
                glove_vector = fromstring(own_strip(glove_fields[2]), dtype=float, sep=" ")
                word = own_strip(glove_fields[0])
                if(word in vocabulary):
                    self.__word_indexes[word] = index
                    self.__word_embeddings.append(glove_vector)
                    index += 1

    def load(self, begin_ofset=None, vocabulary=None):
        """Load the GloVe vectors file into memory

        Args:
            begin_ofset: The number of initial positions that must be assigned
            an empty list value, so that the specific application can use them
            for its own purposes.
            vocabulary: Optional argument. If None, the whole word embeddings
            file is loaded; otherwise, only the words in the set vocabulary
            are loaded.
        """
        logging.debug("GLOVE_WORD_EMBEDDINGS: Start of the loading of word embeddings")
        if(begin_ofset is None):
            begin_ofset = 0
        for times in range(begin_ofset):
            self.__word_embeddings.append([]) #Reserved indexes (e.g. index 0 for unseen words)
        if(vocabulary is None):
            self.__load_full(begin_ofset)
        else:
            self.__load_only_vocabulary(begin_ofset, vocabulary)
        logging.debug("GLOVE_WORD_EMBEDDINGS: End of the loading of word embeddings: " + str(len(self.__word_embeddings)))

    def set_embedding_vector(self, index, vector):
        """Set an embedding vector

        Args:
            index: The index of the vector to add
            vector: A list of values.
        """
        self.__word_embeddings[index] = vector

    def size_embedding_vector(self):
        """It returns the size of an embedding vector

        Returns:
            An integer value which is the size of the embedding vectors.
        """
        return(len(self.__word_embeddings[-1]))

    def number_of_embedding_vectors(self):
        """It returns the number of embedding vectors

        Returns:
            An integer with the number of embedding vectors.
        """
        return(len(self.__word_embeddings))
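

#A minimal usage sketch for this class, not part of the original module. It
#assumes a small GloVe-format file at ./embeddings/sample.txt (a hypothetical
#path) and only calls methods defined above.
if __name__ == "__main__":
    embeddings = GloveWordEmbednigs()
    embeddings.path_file = "./embeddings/sample.txt"
    #reserve one leading row so index 0 can serve as the default vector for unseen words
    embeddings.load(1)
    print("vectors loaded:", embeddings.number_of_embedding_vectors())
    if embeddings.is_word("happy"):
        print("vector size:", embeddings.size_embedding_vector())
        print(embeddings.get_word_embedding("happy")[:5])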