Commit eb0c57c1 by geni

LSTM return_sequences=True; regularization, initialization, random seed; BiLSTM

parent e0d559d6
Showing with 33 additions and 10 deletions
@@ -18,11 +18,19 @@ from keras.utils import np_utils
from sklearn import metrics
from nltk.tokenize.casual import TweetTokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.initializers import glorot_normal, glorot_uniform
from keras import regularizers
import random
from tensorflow import set_random_seed
from scipy import stats
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
RANDOM_SEED = 666
np.random.seed(RANDOM_SEED)
set_random_seed(RANDOM_SEED)
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
CLASSES = []
EMBEDDING_DIM = 200
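A minimal, self-contained sketch of the seeding pattern this hunk introduces, assuming TF 1.x (where set_random_seed is a top-level export; TF 2.x renamed it tf.random.set_seed). The commit imports random but the shown lines never seed it, so the random.seed call below is an addition for completeness:

import os
import random
import numpy as np
from tensorflow import set_random_seed  # TF 1.x API; tf.random.set_seed in TF 2.x

RANDOM_SEED = 666
random.seed(RANDOM_SEED)        # Python's built-in RNG
np.random.seed(RANDOM_SEED)     # NumPy RNG (weight init, shuffling)
set_random_seed(RANDOM_SEED)    # TensorFlow graph-level seed
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TF info/warning logs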
@@ -33,7 +41,7 @@ glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file
#Load the GloVe vectors file into memory; 3 indexes reserved (0: padding, 1: word not present in embedding, 2: magic word)
number_features = 100000
number_features = 500000
begin_ofset = 3
glove.load(number_features, begin_ofset)
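Raising number_features from 100000 to 500000 mostly costs memory; a rough back-of-envelope check (float32 is assumed here for illustration, not taken from the project's glove loader):

# 500,000 rows x 200 dims x 4 bytes (float32) ~= 381 MiB,
# versus ~76 MiB for the previous 100,000-row setting.
rows, dim, bytes_per_float = 500_000, 200, 4
print(rows * dim * bytes_per_float / 1024**2, "MiB")  # -> ~381.47 MiB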
@@ -83,13 +91,17 @@ def fit_transform_vocabulary(corpus):
#corpus_indexes: embedding-model index of each word of each tweet
corpus_indexes = []
corpus_lengths = []
own_append_corpus_lengths = corpus_lengths.append
own_lower = str.lower
for doc in corpus:
tweet_indexes = []
tokens = tokenize(doc)
own_append_corpus_lengths(len(tokens))
for token in tokens:
if(token != "#triggerword"):
if(glove.is_word(token)):
word_index_embedding = glove.get_word_index(token)
if(glove.is_word(own_lower(token))):
word_index_embedding = glove.get_word_index(own_lower(token))
tweet_indexes.append(word_index_embedding)
else:
index = 1
@@ -101,6 +113,10 @@ def fit_transform_vocabulary(corpus):
corpus_indexes.append(tweet_indexes)
print(np.max(corpus_lengths))
print(np.mean(corpus_lengths))
print(stats.mode(corpus_lengths, axis=0))
return corpus_indexes
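For readers without the project's glove wrapper, a hedged sketch of the index convention the loader comment describes (0 = padding, 1 = out-of-vocabulary, 2 = the reserved "magic word", real words starting at offset 3). The dict and the #triggerword-to-2 mapping below are stand-in assumptions, since the hunk cuts off before showing what the real code does with that token:

embedding_vocab = {"happy": 3, "sad": 4}  # word -> index, offset 3 already applied

def to_indexes(tokens):
    indexes = []
    for token in tokens:
        if token == "#triggerword":
            indexes.append(2)  # assumed: reserved "magic word" slot
        else:
            indexes.append(embedding_vocab.get(token.lower(), 1))  # 1 = OOV
    return indexes

print(to_indexes(["happy", "#triggerword", "zzzunseen"]))  # [3, 2, 1]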
@@ -111,7 +127,7 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
corpus_train_index = fit_transform_vocabulary(tweets_train)
corpus_dev_index = fit_transform_vocabulary(tweets_dev)
max_len_input = 40
max_len_input = 30
train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", value = 0)
padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", value = 0)
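Both calls pad and truncate at the end of each tweet (padding="post", truncating="post"); a quick standalone check of that behaviour:

from keras.preprocessing import sequence

docs = [[3, 4, 5], [7] * 40]  # one short tweet, one over max_len_input
padded = sequence.pad_sequences(docs, maxlen=30, padding="post",
                                truncating="post", value=0)
print(padded.shape)    # (2, 30)
print(padded[0][:6])   # [3 4 5 0 0 0] -- zeros appended after the real indexes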
@@ -140,10 +156,14 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
e = Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
model.add(e)
#number of features: 32; each 200-dim vector is converted to a 32-dim vector
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
#model.add(Bidirectional(LSTM(2,dropout=0.2,recurrent_dropout=0.2,return_sequences=True)))
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(LSTM(128, return_sequences=True))
#model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dense(64, activation='relu', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), activity_regularizer=regularizers.l2(0.0001)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(32, activation='relu', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), activity_regularizer=regularizers.l2(0.0001)))
model.add(Dropout(0.5))
model.add(Dense(len(CLASSES), activation='softmax'))
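Since the new LSTM keeps return_sequences=True without a second recurrent layer, the Dense layers act per timestep and Flatten concatenates every position before classification; a shape walk-through, assuming max_len_input = 30 as set above:

# Embedding                   -> (None, 30, 200)  frozen GloVe weights
# LSTM(128, return_sequences) -> (None, 30, 128)  one state vector per token
# Dense(64, relu)             -> (None, 30, 64)   applied independently per timestep
# Dropout(0.25)               -> (None, 30, 64)
# Flatten                     -> (None, 1920)     30 * 64
# Dense(32, relu)             -> (None, 32)
# Dropout(0.5)                -> (None, 32)
# Dense(softmax)              -> (None, len(CLASSES))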
@@ -154,7 +174,10 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print("Training the model...")
model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=50, verbose=1, validation_data=(train_features_pad,tweets_train_labels_numeric))
earlyStopping = EarlyStopping('loss', patience=5, mode='min')
model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=50, verbose=1, validation_data=(train_features_pad,tweets_train_labels_numeric), callbacks=[earlyStopping])
loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, batch_size=32, verbose=1)
print('Training accuracy: %f' % (accuracy * 100))
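Two observations on the new training loop: EarlyStopping('loss', patience=5, mode='min') monitors the training loss (not val_loss), stopping after 5 epochs without improvement, and validation_data is the training set itself, so the reported val_acc mirrors training accuracy rather than generalization. A minimal sketch of the same callback wiring:

from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='loss', patience=5, mode='min')
# model.fit(x_train, y_train, epochs=50, callbacks=[early_stopping])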