Third version of the script

parent 82e1a3eb
Showing with 70 additions and 58 deletions
+,fmplaza,SINAI-155-1,27.06.2018 17:01,file:///home/fmplaza/.config/libreoffice/4;
\ No newline at end of file
@@ -5,22 +5,22 @@ Created on 20 jun. 2018
 '''
 import os
-os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
+from keras.datasets.imdb import get_word_index
 from model.glove_word_embedings import GloveWordEmbednigs
 import pandas as pd
 from nltk.tokenize.casual import TweetTokenizer
 import numpy as np
-from keras.preprocessing.text import Tokenizer
 from keras.preprocessing.sequence import pad_sequences
 from keras.models import Sequential
-from keras.layers import Dense
-from keras.layers import LSTM
-from keras.layers import Embedding
+from keras.layers import Dense, LSTM, Embedding, Bidirectional, Conv1D, GlobalAveragePooling1D, MaxPooling1D, Dropout, Activation, Flatten
 from mpl_toolkits.axes_grid1.axes_size import Padded
 from keras.utils import np_utils
 from sklearn import metrics
 from nltk.tokenize.casual import TweetTokenizer
 from keras.preprocessing import sequence
+import random
+os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
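The tokenizer settings matter downstream: preserve_case=False lowercases tokens to match the lowercase GloVe Twitter vocabulary, reduce_len=True caps character repetitions at three, and strip_handles=False keeps @-mentions as tokens. A quick illustration, following NLTK's documented TweetTokenizer behavior:

```python
from nltk.tokenize.casual import TweetTokenizer

# same settings as TWEET_TOKENIZER in the script
tok = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
print(tok.tokenize("@user I LOOOOOOVE this!!!"))
# ['@user', 'i', 'looove', 'this', '!', '!', '!']
```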
@@ -31,8 +31,11 @@ EMBEDDING_DIM = 200
 glove = GloveWordEmbednigs()
 glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
 glove.path_file = glove_file
-#Load the GloVe vectors file into memory, 2 indexes reserved
-glove.load(2)
+#Load the GloVe vectors file into memory, 3 indexes reserved (0: padding, 1: word not present in the embedding, 2: magic word)
+number_features = 100000
+begin_ofset = 3
+glove.load(number_features, begin_ofset)
 
 #Load the WASSA corpus
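The script drives a project-local wrapper around the GloVe file rather than reading it directly, and that class is not part of this diff. The sketch below is a minimal stand-in inferred purely from the calls made here (load, is_word, get_word_index, get_word_embedding, set_embedding_vector, the word_indexes dict); it is an assumption about, not a copy of, model/glove_word_embedings.py:

```python
import numpy as np

class GloveWordEmbednigs:
    # Assumed interface: mirrors only the call sites in this script.
    def __init__(self):
        self.path_file = None
        self.word_indexes = {}  # word -> row index in the embedding matrix
        self.embeddings = {}    # row index -> embedding vector

    def load(self, max_words, begin_offset):
        # rows [0, begin_offset) stay reserved; each line is "word v1 v2 ... v200"
        with open(self.path_file, encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= max_words:
                    break
                parts = line.rstrip().split(" ")
                index = begin_offset + i
                self.word_indexes[parts[0]] = index
                self.embeddings[index] = np.asarray(parts[1:], dtype="float32")

    def is_word(self, word):
        return word in self.word_indexes

    def get_word_index(self, word):
        return self.word_indexes[word]

    def get_word_embedding(self, word):
        return self.embeddings.get(self.word_indexes[word])

    def set_embedding_vector(self, index, vector):
        self.embeddings[index] = vector
```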
@@ -73,82 +76,89 @@ def tokenize(text):
 def fit_transform_vocabulary(corpus):
     #generate vocabulary of corpus
-    vocabulary = {}
+    #index 0: padding
+    #index 1: word not present in the embedding
+    #index 2: magic word (triggerword)
+    #corpus_indexes: index of each word of the tweet in the embedding model
     corpus_indexes = []
-    index = 1
     for doc in corpus:
-        doc_indexes = []
+        tweet_indexes = []
         tokens = tokenize(doc)
         for token in tokens:
-            if token not in vocabulary:
-                vocabulary[token] = index
-                doc_indexes.append(index)
-                index += 1
-            else:
-                doc_indexes.append(vocabulary[token])
-        corpus_indexes.append(doc_indexes)
-    return (vocabulary, corpus_indexes)
+            if token != "#triggerword":
+                if glove.is_word(token):
+                    #known word: reuse its row index from the embedding model
+                    word_index_embedding = glove.get_word_index(token)
+                    tweet_indexes.append(word_index_embedding)
+                else:
+                    #word not present in the embedding
+                    index = 1
+                    tweet_indexes.append(index)
+            else:
+                #magic word (triggerword)
+                index = 2
+                tweet_indexes.append(index)
+        corpus_indexes.append(tweet_indexes)
+    return corpus_indexes
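For intuition, here is a hypothetical walk-through of the new fit_transform_vocabulary. The concrete indexes depend on which words the loaded GloVe file contains, so the values shown are illustrative only:

```python
corpus = ["I feel so #triggerword today",
          "zzznotawordzzz #triggerword"]
corpus_indexes = fit_transform_vocabulary(corpus)
# Tokens found in GloVe map to their embedding row (>= 3, after the reserved
# offset), out-of-vocabulary tokens map to 1, and "#triggerword" maps to 2.
# Illustrative result: [[14, 523, 87, 2, 301], [1, 2]]
print(corpus_indexes)
```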
 def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev):
     #Classification with RNN and embeddings (pre-trained)
     #calculate vocabulary
-    vocabulary_train, corpus_train_index = fit_transform_vocabulary(tweets_train)
-    vocabulary_size = len(vocabulary_train) + 1
-    max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index], 0))
-    #calculate index vocabulary in corpus dev
-    corpus_dev_index = []
-    own_corpus_dev_index_append = corpus_dev_index.append
-    for tweet_dev in tweets_dev:
-        tokens_dev = tokenize(tweet_dev)
-        own_corpus_dev_index_append([vocabulary_train.get(token_dev, 0) for token_dev in tokens_dev])
-    print(type(own_corpus_dev_index_append))
-    # load pre-trained word embeddings into an Embedding layer
-    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
-    for word, index in vocabulary_train.items():
-        word = word.lower()
-        if index > vocabulary_size - 1:
-            break
-        else:
-            embedding_vector = glove.word_indexes.get(word)
-            if embedding_vector is not None:
-                embedding_matrix[index] = embedding_vector
-    #max_len_input = 30
-    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_train_index[0][0]))
+    corpus_train_index = fit_transform_vocabulary(tweets_train)
+    corpus_dev_index = fit_transform_vocabulary(tweets_dev)
+    #fixed input length: every tweet is padded/truncated to 40 indexes (0 = padding)
+    max_len_input = 40
+    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", value=0)
+    padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", value=0)
     # define RNN model
     model = Sequential()
+    #assign random vectors (uniform in [-0.1, 0.1)) to the two reserved indexes:
+    #index 1: word not present in the embedding, index 2: magic word (triggerword)
+    vector_word_not_present = 2 * 0.1 * np.random.rand(EMBEDDING_DIM) - 0.1
+    glove.set_embedding_vector(1, vector_word_not_present)
+    trigger_word_vector = 2 * 0.1 * np.random.rand(EMBEDDING_DIM) - 0.1
+    glove.set_embedding_vector(2, trigger_word_vector)
+    #number of rows in the embedding matrix: the loaded features plus the 3 reserved indexes
+    feature_size = number_features + 3
+    embedding_matrix = np.zeros((feature_size, EMBEDDING_DIM))
+    for word, idx in glove.word_indexes.items():
+        embedding_vec = glove.get_word_embedding(word)
+        if embedding_vec is not None and embedding_vec.shape[0] == EMBEDDING_DIM:
+            embedding_matrix[idx] = np.asarray(embedding_vec)
+    #copy the special vectors into the matrix explicitly, in case word_indexes does not cover the reserved rows
+    embedding_matrix[1] = vector_word_not_present
+    embedding_matrix[2] = trigger_word_vector
     #input_length: length of the input sequences, when it is constant
-    e = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
+    e = Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
     model.add(e)
-    #number of features: 32, each vector of 200 dim is converted to a vector of 32 dim
-    model.add(LSTM(32))
-    #model.add(Dense(32, activation='tanh'))
+    #the LSTM reads the sequence of 200-dim vectors and summarizes it into a single 128-dim vector
+    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
+    #model.add(Bidirectional(LSTM(2, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
+    model.add(Dense(32, activation='tanh'))
+    model.add(Dropout(0.5))
     model.add(Dense(len(CLASSES), activation='softmax'))
+    # compile the model (the softmax layer already yields class probabilities, so no extra activation is stacked on top)
+    print("compiling model")
+    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
     # summarize the model
     print(model.summary())
-    # fit the model
-    # compile the model
-    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
+    # fit the model
+    print("training")
     model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=10, verbose=0)
+    # evaluate the model on the training data
     loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, verbose=0)
     print('Accuracy: %f' % (accuracy*100))
-    padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_dev_index[0][0]))
+    #prediction on the dev set
     tweets_dev_classified_labels = model.predict_classes(padded_docs_dev, batch_size=32, verbose=1)
     return tweets_dev_classified_labels
...
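Since the model compiles with categorical_crossentropy, tweets_train_labels_numeric must be one-hot vectors, and predict_classes returns integer class indexes. A minimal sketch of driver code under those assumptions; the sample texts, label names, and the mapping to the script's CLASSES list are placeholders, not part of this commit:

```python
from keras.utils import np_utils

tweets_train = ["I feel so #triggerword today"]   # assumed WASSA texts
tweets_train_labels = ["joy"]                     # assumed gold labels
tweets_dev = ["#triggerword again"]

# one-hot encode the gold labels against the script's CLASSES list
label_to_index = {label: i for i, label in enumerate(CLASSES)}
labels_numeric = [label_to_index[label] for label in tweets_train_labels]
tweets_train_labels_numeric = np_utils.to_categorical(labels_numeric, num_classes=len(CLASSES))

# train, then map the predicted indexes back to label names
predicted = classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev)
predicted_labels = [CLASSES[i] for i in predicted]
print(predicted_labels)
```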