Commit eb0c57c1 by geni

LSTM return_sequences=True; regularization, initialization, random seed; BiLSTM

parent e0d559d6
Showing with 33 additions and 10 deletions
@@ -18,11 +18,19 @@ from keras.utils import np_utils
 from sklearn import metrics
 from nltk.tokenize.casual import TweetTokenizer
 from keras.preprocessing import sequence
+from keras.callbacks import EarlyStopping
+from keras.initializers import glorot_normal, glorot_uniform
+from keras import regularizers
 import random
+from tensorflow import set_random_seed
+from scipy import stats

+RANDOM_SEED = 666
+np.random.seed(RANDOM_SEED)
+set_random_seed(RANDOM_SEED)

 os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

 TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
 CLASSES = []
 EMBEDDING_DIM = 200
@@ -33,7 +41,7 @@ glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
 glove.path_file = glove_file
 #Load the Glove vectors file into memory, 3 index reserved (0: paddind, 1: word not present in embedding, 2: magic word)
-number_features = 100000
+number_features = 500000
 begin_ofset = 3
 glove.load(number_features, begin_ofset)
@@ -83,13 +91,17 @@ def fit_transform_vocabulary(corpus):
     #corpus_indexes: index of each word of tweet in the embedding model
     corpus_indexes = []
+    corpus_lengths = []
+    own_append_corpus_lengths = corpus_lengths.append
+    own_lower = str.lower
     for doc in corpus:
         tweet_indexes = []
         tokens = tokenize(doc)
+        own_append_corpus_lengths(len(tokens))
         for token in tokens:
             if(token != "#triggerword"):
-                if(glove.is_word(token)):
-                    word_index_embedding = glove.get_word_index(token)
+                if(glove.is_word(own_lower(token))):
+                    word_index_embedding = glove.get_word_index(own_lower(token))
                     tweet_indexes.append(word_index_embedding)
                 else:
                     index = 1
@@ -101,6 +113,10 @@ def fit_transform_vocabulary(corpus):
         corpus_indexes.append(tweet_indexes)

+    print(np.max(corpus_lengths))
+    print(np.mean(corpus_lengths))
+    print(stats.mode(corpus_lengths, axis=0))
+
     return corpus_indexes
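
Side note on the three print calls added above (a reading of the diff, not code from the repository): they report the maximum, mean, and most frequent token count per tweet, which appears to be what motivates lowering max_len_input from 40 to 30 in the next hunk. A minimal, self-contained sketch of the same statistics, assuming corpus_lengths holds the per-tweet token counts collected in the loop:

# Hypothetical standalone illustration; the sample counts below are made up.
import numpy as np
from scipy import stats

corpus_lengths = [12, 18, 30, 25, 18]         # per-tweet token counts (example data)
print(np.max(corpus_lengths))                 # longest tweet, in tokens
print(np.mean(corpus_lengths))                # average tweet length
print(stats.mode(corpus_lengths, axis=0))     # most frequent length (ModeResult)
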
@@ -111,7 +127,7 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
     corpus_train_index = fit_transform_vocabulary(tweets_train)
     corpus_dev_index = fit_transform_vocabulary(tweets_dev)
-    max_len_input = 40
+    max_len_input = 30
     train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", value = 0)
     padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", value = 0)
@@ -140,10 +156,14 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
     e = Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
     model.add(e)
     #number of features:_32 each vector of 200 dim is converted to a vector of 32 dim
-    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
-    #model.add(Bidirectional(LSTM(2,dropout=0.2,recurrent_dropout=0.2,return_sequences=True)))
-    model.add(Dense(32, activation='relu'))
+    model.add(LSTM(128, return_sequences=True))
+    #model.add(Bidirectional(LSTM(128, return_sequences=True)))
+    model.add(Dense(32, activation='relu'))
+    model.add(Dense(64, activation='relu', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), activity_regularizer=regularizers.l2(0.0001)))
+    model.add(Dropout(0.25))
+    model.add(Flatten())
+    model.add(Dense(32, activation='relu', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), activity_regularizer=regularizers.l2(0.0001)))
     model.add(Dropout(0.5))
     model.add(Dense(len(CLASSES), activation='softmax'))
@@ -154,7 +174,10 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
     # compile the model
     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
     print("Training the model...")
-    model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=50, verbose=1, validation_data=(train_features_pad,tweets_train_labels_numeric))
+
+    earlyStopping = EarlyStopping('loss', patience=5, mode='min')
+    model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=50, verbose=1, validation_data=(train_features_pad,tweets_train_labels_numeric), callbacks=[earlyStopping])
+
     loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, batch_size=32, verbose=1)
     print('Accuracy trainning: %f' % (accuracy*100))
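
The commit message mentions a BiLSTM, but in the diff the Bidirectional layer appears only as a commented-out line. Purely as an illustration (not code from this commit), here is a minimal sketch of how that variant could be wired with the same layer sizes, initializer, and regularizer used above; the function name and its arguments are hypothetical:

# Hypothetical sketch of the BiLSTM variant hinted at in the commit message.
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, Flatten
from keras.initializers import glorot_uniform
from keras import regularizers

RANDOM_SEED = 666

def build_bilstm(feature_size, embedding_dim, max_len_input, embedding_matrix, n_classes):
    model = Sequential()
    # Frozen pretrained embeddings, as in the diff
    model.add(Embedding(feature_size, embedding_dim, input_length=max_len_input,
                        weights=[embedding_matrix], trainable=False))
    # The Bidirectional wrapper doubles the per-timestep output width (128 -> 256)
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dense(64, activation='relu',
                    kernel_initializer=glorot_uniform(seed=RANDOM_SEED),
                    activity_regularizer=regularizers.l2(0.0001)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(32, activation='relu',
                    kernel_initializer=glorot_uniform(seed=RANDOM_SEED),
                    activity_regularizer=regularizers.l2(0.0001)))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    return model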