Commit eb0c57c1 by geni

LSTM return_sequences=True; regularization, initialization, random seed; BiLSTM

parent e0d559d6
Showing with 33 additions and 10 deletions
@@ -18,11 +18,19 @@ from keras.utils import np_utils
from sklearn import metrics
from nltk.tokenize.casual import TweetTokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.initializers import glorot_normal, glorot_uniform
from keras import regularizers
import random
from tensorflow import set_random_seed
from scipy import stats
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
RANDOM_SEED = 666
np.random.seed(RANDOM_SEED)
set_random_seed(RANDOM_SEED)
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
CLASSES = []
EMBEDDING_DIM = 200
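A minimal, self-contained sketch of the seeding pattern this hunk introduces, assuming TF 1.x (where set_random_seed is a top-level export; TF 2.x renamed it tf.random.set_seed). The commit imports random but the shown lines never seed it, so the random.seed call below is an addition for completeness:

import os
import random
import numpy as np
from tensorflow import set_random_seed  # TF 1.x API; tf.random.set_seed in TF 2.x

RANDOM_SEED = 666
random.seed(RANDOM_SEED)        # Python's built-in RNG
np.random.seed(RANDOM_SEED)     # NumPy RNG (weight init, shuffling)
set_random_seed(RANDOM_SEED)    # TensorFlow graph-level seed
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TF info/warning logs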
@@ -33,7 +41,7 @@ glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file
#Load the GloVe vectors file into memory; 3 indexes reserved (0: padding, 1: word not present in embedding, 2: magic word)
number_features = 100000
number_features = 500000
begin_ofset = 3
glove.load(number_features, begin_ofset)
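Raising number_features from 100000 to 500000 mostly costs memory; a rough back-of-envelope check (float32 is assumed here for illustration, not taken from the project's glove loader):

# 500,000 rows x 200 dims x 4 bytes (float32) ~= 381 MiB,
# versus ~76 MiB for the previous 100,000-row setting.
rows, dim, bytes_per_float = 500_000, 200, 4
print(rows * dim * bytes_per_float / 1024**2, "MiB")  # -> ~381.47 MiB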
@@ -83,13 +91,17 @@ def fit_transform_vocabulary(corpus):
#corpus_indexes: embedding-model index of each word of each tweet
corpus_indexes = []
corpus_lengths = []
own_append_corpus_lengths = corpus_lengths.append
own_lower = str.lower
for doc in corpus:
tweet_indexes = []
tokens = tokenize(doc)
own_append_corpus_lengths(len(tokens))
for token in tokens:
if(token != "#triggerword"):
if(glove.is_word(token)):
word_index_embedding = glove.get_word_index(token)
if(glove.is_word(own_lower(token))):
word_index_embedding = glove.get_word_index(own_lower(token))
tweet_indexes.append(word_index_embedding)
else:
index = 1
@@ -101,6 +113,10 @@ def fit_transform_vocabulary(corpus):
corpus_indexes.append(tweet_indexes)
print(np.max(corpus_lengths))
print(np.mean(corpus_lengths))
print(stats.mode(corpus_lengths, axis=0))
return corpus_indexes
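For readers without the project's glove wrapper, a hedged sketch of the index convention the loader comment describes (0 = padding, 1 = out-of-vocabulary, 2 = the reserved "magic word", real words starting at offset 3). The dict and the #triggerword-to-2 mapping below are stand-in assumptions, since the hunk cuts off before showing what the real code does with that token:

embedding_vocab = {"happy": 3, "sad": 4}  # word -> index, offset 3 already applied

def to_indexes(tokens):
    indexes = []
    for token in tokens:
        if token == "#triggerword":
            indexes.append(2)  # assumed: reserved "magic word" slot
        else:
            indexes.append(embedding_vocab.get(token.lower(), 1))  # 1 = OOV
    return indexes

print(to_indexes(["happy", "#triggerword", "zzzunseen"]))  # [3, 2, 1]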
@@ -111,7 +127,7 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
corpus_train_index = fit_transform_vocabulary(tweets_train)
corpus_dev_index = fit_transform_vocabulary(tweets_dev)
max_len_input = 40
max_len_input = 30
train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", value = 0)
padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", value = 0)
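Both calls pad and truncate at the end of each tweet (padding="post", truncating="post"); a quick standalone check of that behaviour:

from keras.preprocessing import sequence

docs = [[3, 4, 5], [7] * 40]  # one short tweet, one over max_len_input
padded = sequence.pad_sequences(docs, maxlen=30, padding="post",
                                truncating="post", value=0)
print(padded.shape)    # (2, 30)
print(padded[0][:6])   # [3 4 5 0 0 0] -- zeros appended after the real indexes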
@@ -140,10 +156,14 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
e = Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
model.add(e)
#number of features: 32; each 200-dim vector is converted to a 32-dim vector
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
#model.add(Bidirectional(LSTM(2,dropout=0.2,recurrent_dropout=0.2,return_sequences=True)))
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(LSTM(128, return_sequences=True))
#model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dense(64, activation='relu', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), activity_regularizer=regularizers.l2(0.0001)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(32, activation='relu', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), activity_regularizer=regularizers.l2(0.0001)))
model.add(Dropout(0.5))
model.add(Dense(len(CLASSES), activation='softmax'))
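Since the new LSTM keeps return_sequences=True without a second recurrent layer, the Dense layers act per timestep and Flatten concatenates every position before classification; a shape walk-through, assuming max_len_input = 30 as set above:

# Embedding                   -> (None, 30, 200)  frozen GloVe weights
# LSTM(128, return_sequences) -> (None, 30, 128)  one state vector per token
# Dense(64, relu)             -> (None, 30, 64)   applied independently per timestep
# Dropout(0.25)               -> (None, 30, 64)
# Flatten                     -> (None, 1920)     30 * 64
# Dense(32, relu)             -> (None, 32)
# Dropout(0.5)                -> (None, 32)
# Dense(softmax)              -> (None, len(CLASSES))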
@@ -154,7 +174,10 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print("Training the model...")
model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=50, verbose=1, validation_data=(train_features_pad,tweets_train_labels_numeric))
earlyStopping = EarlyStopping('loss', patience=5, mode='min')
model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=50, verbose=1, validation_data=(train_features_pad,tweets_train_labels_numeric), callbacks=[earlyStopping])
loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, batch_size=32, verbose=1)
print('Training accuracy: %f' % (accuracy * 100))
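Two observations on the new training loop: EarlyStopping('loss', patience=5, mode='min') monitors the training loss (not val_loss), stopping after 5 epochs without improvement, and validation_data is the training set itself, so the reported val_acc mirrors training accuracy rather than generalization. A minimal sketch of the same callback wiring:

from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='loss', patience=5, mode='min')
# model.fit(x_train, y_train, epochs=50, callbacks=[early_stopping])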