First approximation of the RNN program

'''
Created on 20 June 2018
@author: fmplaza
'''
from model.glove_word_embedings import GloveWordEmbednigs
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
CLASSES = []
EMBEDDING_DIM = 200
glove = GloveWordEmbednigs()
glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file
#Load the WASSA corpus
def read_corpus():

    tweets_train_labels_numeric = []
    classes_append = CLASSES.append

    tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0)
    tweets_train_labels = tweets_train['emotion']
    tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0)
    tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0)

    #map each emotion label to a numeric class index
    for label in tweets_train_labels.tolist():
        if(label not in CLASSES):
            classes_append(label)
        tweets_train_labels_numeric.append(CLASSES.index(label))

    return tweets_train.tweet, tweets_train_labels_numeric, tweets_dev.tweet, tweets_dev_labels
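
#Illustrative sketch of the corpus layout assumed by read_corpus() above. This is
#an assumption inferred from the columns the code accesses, not the official WASSA
#release notes: a tab-separated file with a header row that includes at least the
#'tweet' and 'emotion' columns, for example:
#
#    tweet<TAB>emotion
#    Thinking about the final whistle still hurts<TAB>sadness
#    what a great day with the team!<TAB>joy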
#Classification with RNN and embeddings (pre-trained)
def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric):

    #load the whole embedding file into memory, 2 indexes reserved
    glove.load(2)

    #preprocessing tweets train
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tweets_train)

    #calculate vocabulary size (index 0 is reserved by Keras)
    vocabulary_size = len(tokenizer.word_index) + 1

    # load pre-trained word embeddings into an embedding matrix, one row per word
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, index in tokenizer.word_index.items():
        if index > vocabulary_size - 1:
            break
        else:
            embedding_vector = glove.get_word_embedding(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector

    max_len_input = 20

    #integer encode the documents
    encoded_docs = tokenizer.texts_to_sequences(tweets_train)

    #pad documents to a max length of n words
    padded_docs = pad_sequences(encoded_docs, maxlen=max_len_input, padding='post')

    # define RNN model: the pre-trained matrix is passed as frozen weights
    model = Sequential()
    e = Embedding(vocabulary_size, EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_len_input, trainable=False)
    model.add(e)
    model.add(LSTM(32))
    #model.add(Dense(32, activation='tanh'))
    # one output unit per emotion class (softmax over a single unit would always output 1)
    model.add(Dense(len(CLASSES), activation='softmax'))

    # compile the model with a multi-class loss that accepts integer labels
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

    # summarize the model
    print(model.summary())

    # fit the model
    print(len(padded_docs))
    print(len(tweets_train_labels_numeric))
    model.fit(padded_docs, np.array(tweets_train_labels_numeric), epochs=50, verbose=0)

    # evaluate the model on the training data
    loss, accuracy = model.evaluate(padded_docs, np.array(tweets_train_labels_numeric), verbose=0)
    print('Accuracy: %f' % (accuracy * 100))
def main():
    tweets_train, tweets_train_labels_numeric, tweets_dev, tweets_dev_labels = read_corpus()
    classification_embedings_rnn(tweets_train, tweets_train_labels_numeric)

if __name__ == '__main__':
    main()
#!/usr/bin/python
# *-* coding: utf-8 *-*
'''
Created on 2 June 2016
@author: Eugenio Martínez Cámara
'''
from abc import ABCMeta, abstractmethod
class AbstractWordEmbedding(metaclass=ABCMeta):
    '''Abstract class that defines the attributes and methods for word embedding
    classes.

    This class represents an embeddings file, so it must maintain a matrix
    (list of lists) with the embedding vectors and an index of words with their
    corresponding row in the matrix. It is very important to keep the first row
    of the matrix, index 0, as an empty list. Index 0 is reserved for the
    default vector assigned to unseen words, which is application dependent.

    Attributes:
        __word_indexes: A dictionary whose keys are the words of the embeddings file
        and whose values are the indexes of the words in the matrix of embeddings.
        The index goes from 1 to the size of the embeddings file. The index
        zero is a reserved value.
        __word_embeddings: A matrix whose rows are the embeddings linked to
        each word.
    '''

    @property
    @abstractmethod
    def word_indexes(self):
        """Getter method for the attribute word_indexes

        Returns:
            A dictionary whose keys are the words of the embeddings file and
            whose values are the indexes of the words in the matrix of embeddings.
        """
        ...

    @property
    @abstractmethod
    def word_embeddings(self):
        """Getter method for the attribute word_embeddings

        Returns:
            A matrix (list of lists) whose rows are vectors of floats, one per
            word.
        """
        ...

    @property
    @abstractmethod
    def path_file(self):
        """Getter method for the attribute path_file

        Returns:
            path_file: A string with the path of the file.
        """
        ...

    @path_file.setter
    @abstractmethod
    def path_file(self, a_path_file):
        """Setter method for the attribute path_file

        Args:
            a_path_file: A string with the path of the file.
        """
        ...

    @property
    @abstractmethod
    def encoding(self):
        """Getter method for the attribute encoding

        Returns:
            The encoding of the word embeddings file.
        """
        ...

    @encoding.setter
    @abstractmethod
    def encoding(self, a_encoding):
        """Setter method for the attribute encoding

        Args:
            a_encoding: The encoding of the word embeddings file.
        """
        ...

    @abstractmethod
    def is_word(self, word):
        """Method to check whether the word is in the embeddings file.

        Returns:
            True: If the word is in the embeddings
            False: If the word isn't in the embeddings
        """
        ...

    @abstractmethod
    def get_word_embedding(self, word):
        """Returns the embedding of the word

        Args:
            word: A string that is the word whose embedding vector will be
            retrieved.

        Returns:
            If the word is in the embeddings, the vector of the word; otherwise None
        """
        ...

    @abstractmethod
    def clean(self):
        """Clean the memory of the embeddings.
        """
        ...

    @abstractmethod
    def load(self, begin_ofset=None, vocabulary=None):
        """Load the word embeddings file into memory

        Args:
            begin_ofset: The number of initial positions that must be assigned
            a null value, so that the specific application can use them for its
            own purposes.
            vocabulary: Optional argument. If None, the whole word embeddings
            file is loaded; otherwise, only the words in the set vocabulary
            are loaded.
        """
        ...

    @abstractmethod
    def set_embedding_vector(self, index, vector):
        """Set an embedding vector

        Args:
            index: The index of the vector to add
            vector: A list of values.
        """
        ...

    @abstractmethod
    def size_embedding_vector(self):
        """It returns the size of an embedding vector

        Returns:
            An integer value which is the size of the embedding vectors.
        """
        ...

    @abstractmethod
    def number_of_embedding_vectors(self):
        """It returns the number of embedding vectors

        Returns:
            An integer with the number of embedding vectors.
        """
        ...
#!/usr/bin/python3
# *-* coding: utf-8 *-*
'''
Created on 11 May 2016
@author: Eugenio Martínez Cámara
'''
import logging
from model.abstract_word_embedding import AbstractWordEmbedding
from numpy import fromstring
class GloveWordEmbednigs(AbstractWordEmbedding):
    '''Class for the management of GloVe vectors

    Attributes:
        __word_indexes: A dictionary whose keys are the words of the embeddings file
        and whose values are the indexes of the words in the matrix of embeddings.
        The index goes from 1 to the size of the embeddings file. The index
        zero is a reserved value.
        __word_embeddings: A matrix whose rows are the embeddings linked to
        each word.

    Constants:
        GLOVE_SEPARATOR: The separator char in the GloVe embeddings file
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.__word_indexes = {}
        self.__word_embeddings = []
        self.__path_file = ""
        self.__GLOVE_SEPARATOR = " "
        self.__encoding = "us-ascii"

    @property
    def word_embeddings(self):
        """Getter method for the attribute word_embeddings

        Returns:
            A matrix (list of lists) with the embedding vectors. Each row is
            linked to a word of the attribute word_indexes.
        """
        return(self.__word_embeddings)

    @property
    def word_indexes(self):
        """Getter method for the attribute word_indexes

        Returns:
            A dictionary whose keys are the words of the embeddings file and
            whose values are the indexes of the words in the matrix of embeddings.
        """
        return(self.__word_indexes)

    @property
    def path_file(self):
        """Getter method for the attribute path_file

        Returns:
            path_file: A string with the path of the file.
        """
        return(self.__path_file)

    @path_file.setter
    def path_file(self, a_path_file):
        """Setter method for the attribute path_file

        Args:
            a_path_file: A string with the path of the file.
        """
        self.__path_file = a_path_file

    @property
    def encoding(self):
        """Getter method for the attribute encoding
        """
        return(self.__encoding)

    @encoding.setter
    def encoding(self, a_encoding):
        """Setter method for the attribute encoding

        Args:
            a_encoding: A string value with the encoding of the word embeddings file.
        """
        self.__encoding = a_encoding

    def is_word(self, word):
        """Method to check whether the word is in the embeddings file.

        Args:
            word: A string that is the word to look up in the embeddings.

        Returns:
            True: If the word is in the embeddings
            False: If the word isn't in the embeddings
        """
        return( word in self.__word_indexes )

    def get_word_index(self, word):
        """Returns the index of the word

        Args:
            word: A string that is the word whose index is going to be
            retrieved.

        Returns:
            If the word is in the embeddings, the index of the word. Otherwise,
            None is returned.
        """
        return(self.__word_indexes.get(word, None))

    def get_word_embedding(self, word):
        """Returns the embedding of the word

        Args:
            word: A string that is the word whose embedding vector will be
            retrieved.

        Returns:
            If the word is in the embeddings, the vector of the word; otherwise None
        """
        return_value = None
        word_index = self.__word_indexes.get(word, None)
        if(word_index is not None):
            return_value = self.__word_embeddings[word_index]
        return(return_value)

    def clean(self):
        """Clean the memory of the embeddings.
        """
        self.__word_embeddings.clear()

    def __load_full(self, index):
        """Load the full file of word embeddings

        Args:
            index: The first free index of the word embeddings main vector
        """
        own_partition = str.partition
        own_strip = str.strip
        with open(self.__path_file, "r", encoding="utf-8") as glove_file:
            for line in glove_file:
                #each line is "word v1 v2 ... vn"; split off the word and parse the vector
                glove_fields = own_partition(line, self.__GLOVE_SEPARATOR)
                glove_vector = fromstring(own_strip(glove_fields[2]), dtype=float, sep=" ")
                self.__word_indexes[own_strip(glove_fields[0])] = index
                self.__word_embeddings.append(glove_vector)
                index += 1

    def __load_only_vocabulary(self, index, vocabulary):
        """Load only the words of the vocabulary

        Args:
            index: The first free index of the word embeddings main vector
            vocabulary: the set of words to be loaded.
        """
        own_partition = str.partition
        own_strip = str.strip
        with open(self.__path_file, "r", encoding="utf-8") as glove_file:
            for line in glove_file:
                glove_fields = own_partition(line, self.__GLOVE_SEPARATOR)
                glove_vector = fromstring(own_strip(glove_fields[2]), dtype=float, sep=" ")
                word = own_strip(glove_fields[0])
                if(word in vocabulary):
                    self.__word_indexes[word] = index
                    self.__word_embeddings.append(glove_vector)
                    index += 1

    def load(self, begin_ofset=None, vocabulary=None):
        """Load the GloVe vectors file into memory

        Args:
            begin_ofset: The number of initial positions that must be assigned
            an empty list value, so that the specific application can use them
            for its own purposes.
            vocabulary: Optional argument. If None, the whole word embeddings
            file is loaded; otherwise, only the words in the set vocabulary
            are loaded.
        """
        logging.debug("GLOVE_WORD_EMBEDDINGS: Start of the loading of word embeddings")
        if(begin_ofset is None):
            begin_ofset = 0
        for times in range(begin_ofset):
            self.__word_embeddings.append([]) #Reserved indexes (e.g. index 0 for unseen words)
        if(vocabulary is None):
            self.__load_full(begin_ofset)
        else:
            self.__load_only_vocabulary(begin_ofset, vocabulary)
        logging.debug("GLOVE_WORD_EMBEDDINGS: End of the loading of word embeddings: " + str(len(self.__word_embeddings)))

    def set_embedding_vector(self, index, vector):
        """Set an embedding vector

        Args:
            index: The index of the vector to add
            vector: A list of values.
        """
        self.__word_embeddings[index] = vector

    def size_embedding_vector(self):
        """It returns the size of an embedding vector

        Returns:
            An integer value which is the size of the embedding vectors.
        """
        return(len(self.__word_embeddings[-1]))

    def number_of_embedding_vectors(self):
        """It returns the number of embedding vectors

        Returns:
            An integer with the number of embedding vectors.
        """
        return(len(self.__word_embeddings))
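

#A minimal usage sketch for this class, not part of the original module. It
#assumes a small GloVe-format file at ./embeddings/sample.txt (a hypothetical
#path) and only calls methods defined above.
if __name__ == "__main__":
    embeddings = GloveWordEmbednigs()
    embeddings.path_file = "./embeddings/sample.txt"
    #reserve one leading row so index 0 can serve as the default vector for unseen words
    embeddings.load(1)
    print("vectors loaded:", embeddings.number_of_embedding_vectors())
    if embeddings.is_word("happy"):
        print("vector size:", embeddings.size_embedding_vector())
        print(embeddings.get_word_embedding("happy")[:5])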