Third version of the script

parent 82e1a3eb
Showing with 70 additions and 58 deletions
@@ -5,22 +5,22 @@ Created on 20 jun. 2018
'''
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
from keras.datasets.imdb import get_word_index
from model.glove_word_embedings import GloveWordEmbednigs
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dense, LSTM, Embedding, Bidirectional, Conv1D, GlobalAveragePooling1D, MaxPooling1D, Dropout, Activation, Flatten
from mpl_toolkits.axes_grid1.axes_size import Padded
from keras.utils import np_utils
from sklearn import metrics
from nltk.tokenize.casual import TweetTokenizer
from keras.preprocessing import sequence
import random
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
@@ -31,8 +31,11 @@ EMBEDDING_DIM = 200
glove = GloveWordEmbednigs()
glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file
#Load the Glove vectors file into memory, 2 index reserved
glove.load(2)
#Load the Glove vectors file into memory, 3 indexes reserved (0: padding, 1: word not present in the embedding, 2: magic word)
number_features = 100000
begin_ofset = 3
glove.load(number_features, begin_ofset)
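The internals of GloveWordEmbednigs are not shown in this commit; as a hedged sketch, a loader like this presumably reads the plain-text GloVe file ("word v1 ... v200" per line) and assigns each word a row index starting at begin_ofset, so rows 0-2 stay free for the special tokens. load_glove and all names below are illustrative, not the repository's API:

import numpy as np

def load_glove(path, max_words, begin_offset):
    #map word -> matrix row; rows 0..begin_offset-1 stay reserved for special tokens
    word_indexes = {}
    index_vectors = {}
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= max_words:
                break
            parts = line.rstrip().split(" ")
            word_indexes[parts[0]] = i + begin_offset
            index_vectors[i + begin_offset] = np.asarray(parts[1:], dtype="float32")
    #total rows needed downstream: max_words + begin_offset (here 100000 + 3)
    return word_indexes, index_vectors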
#Load the WASSA corpus
@@ -73,82 +76,89 @@ def tokenize(text):
def fit_transform_vocabulary(corpus):
#generate vocabulary of corpus
vocabulary = {}
#index 0: padding
#index 1: word not present in the embedding
#index 2: magic word (triggerword)
#corpus_indexes: index of each word of tweet in the embedding model
corpus_indexes = []
index = 1
for doc in corpus:
doc_indexes = []
tweet_indexes = []
tokens = tokenize(doc)
for token in tokens:
if token not in vocabulary:
vocabulary[token] = index
doc_indexes.append(index)
index += 1
if(token != "#triggerword"):
if(glove.is_word(token)):
word_index_embedding = glove.get_word_index(token)
tweet_indexes.append(word_index_embedding)
else:
index = 1
tweet_indexes.append(index)
else:
doc_indexes.append(vocabulary[token])
index = 2
tweet_indexes.append(index)
corpus_indexes.append(doc_indexes)
return (vocabulary, corpus_indexes)
corpus_indexes.append(tweet_indexes)
return corpus_indexes
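As a quick illustration of the new mapping (the tokens here are hypothetical; the exact row of an in-vocabulary word depends on its position in the GloVe file):

corpus_indexes = fit_transform_vocabulary(["happy xyzzy42 #triggerword"])
#e.g. [[4821, 1, 2]]: an embedding row (>= 3) for "happy", 1 for the
#out-of-vocabulary token "xyzzy42", and 2 for the trigger word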
def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev):
#Classification with RNN and pre-trained embeddings
#calculate vocabulary
vocabulary_train, corpus_train_index = fit_transform_vocabulary(tweets_train)
vocabulary_size = len(vocabulary_train) + 1
max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index], 0))
#calculate index vocabulary in corpus dev
corpus_dev_index = []
own_corpus_dev_index_append = corpus_dev_index.append
for tweet_dev in tweets_dev:
tokens_dev = tokenize(tweet_dev)
own_corpus_dev_index_append([vocabulary_train.get(token_dev, 0) for token_dev in tokens_dev])
print(type(own_corpus_dev_index_append))
# load pre-trained word embeddings into an Embedding layer
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, index in vocabulary_train.items():
word = word.lower()
if index > vocabulary_size - 1:
break
else:
embedding_vector = glove.word_indexes.get(word)
if embedding_vector is not None:
embedding_matrix[index] = embedding_vector
#max_len_input = 30
train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_train_index[0][0]))
corpus_train_index = fit_transform_vocabulary(tweets_train)
corpus_dev_index = fit_transform_vocabulary(tweets_dev)
max_len_input = 40
train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", value = 0)
padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", value = 0)
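With padding="post" and truncating="post", pad_sequences fills or cuts at the end of each sequence, which is why index 0 is reserved as a dedicated padding row in the embedding matrix; a minimal check:

print(sequence.pad_sequences([[5, 9, 4]], maxlen=5, padding="post", truncating="post", value=0))
#[[5 9 4 0 0]]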
# define RNN model
model = Sequential()
#assign random vectors to the reserved special indexes (1: word not present, 2: trigger word)
vector_word_not_present = 0.1 * (2 * np.random.rand(EMBEDDING_DIM) - 1)
glove.set_embedding_vector(1, vector_word_not_present)
trigger_word_vector = 0.1 * (2 * np.random.rand(EMBEDDING_DIM) - 1)
glove.set_embedding_vector(2, trigger_word_vector)
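The expression 0.1 * (2 * np.random.rand(EMBEDDING_DIM) - 1) draws every component uniformly from [-0.1, 0.1], a small scale so the random special vectors do not dominate the pre-trained GloVe vectors:

v = 0.1 * (2 * np.random.rand(5) - 1)
print(v.min() >= -0.1 and v.max() <= 0.1)  #True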
#number of features in embeddings model
feature_size = number_features + 3
embedding_matrix = np.zeros((feature_size, EMBEDDING_DIM))
for word, idx in glove.word_indexes.items():
embedding_vec = glove.get_word_embedding(word)
if embedding_vec is not None and embedding_vec.shape[0]==EMBEDDING_DIM:
embedding_matrix[idx] = np.asarray(embedding_vec)
#input_length: Length of input sequences, when it is constant
e = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
e = Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
model.add(e)
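With trainable=False the GloVe weights stay frozen during training and the layer acts as a pure row lookup; a small standalone check of that behaviour, assuming the feature_size and embedding_matrix defined above:

lookup = Sequential()
lookup.add(Embedding(feature_size, EMBEDDING_DIM, input_length=2, weights=[embedding_matrix], trainable=False))
print(lookup.predict(np.array([[0, 2]])).shape)  #(1, 2, 200); index 0 maps to the all-zero padding row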
#number of features: 32; each 200-dim vector is converted to a 32-dim vector
model.add(LSTM(32))
#model.add(Dense(32, activation='tanh'))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
#model.add(Bidirectional(LSTM(2,dropout=0.2,recurrent_dropout=0.2,return_sequences=True)))
model.add(Dense(32, activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(len(CLASSES), activation='softmax'))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
print("compilando modelo")
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
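categorical_crossentropy expects one-hot targets, which is presumably what np_utils was imported for; if the labels arrive as integers (label_ints is a hypothetical name, not defined in this commit), the expected encoding would be:

#hypothetical: convert integer class labels to one-hot vectors for categorical_crossentropy
tweets_train_labels_numeric = np_utils.to_categorical(label_ints, num_classes=len(CLASSES))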
print("entrenando")
model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=10, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, verbose=0)
print('Accuracy: %f' % (accuracy*100))
padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_dev_index[0][0]))
#prediction
tweets_dev_classified_labels = model.predict_classes(padded_docs_dev, batch_size=32, verbose=1)
return tweets_dev_classified_labels
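Since sklearn's metrics module is imported at the top, a natural follow-up outside the function is to score the dev predictions against gold labels (tweets_dev_labels_numeric is a hypothetical name for the integer gold labels, not defined in this commit):

predicted_labels = classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev)
print("Dev accuracy: %f" % metrics.accuracy_score(tweets_dev_labels_numeric, predicted_labels))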