Flor Miriam Plaza del Arco / WASSA 2018
Commit eb0c57c1 authored Jul 02, 2018 by geni
LSTM return_sequences=True; regularization, initialization, random seed; BiLSTM
parent e0d559d6
Showing 1 changed file with 33 additions and 10 deletions
embeddings_RNN.py
@@ -18,11 +18,19 @@ from keras.utils import np_utils
from sklearn import metrics
from nltk.tokenize.casual import TweetTokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.initializers import glorot_normal, glorot_uniform
from keras import regularizers
import random
from tensorflow import set_random_seed
from scipy import stats

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
RANDOM_SEED = 666
np.random.seed(RANDOM_SEED)
set_random_seed(RANDOM_SEED)

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
CLASSES = []
EMBEDDING_DIM = 200
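Note, not part of this commit: the hunk above seeds NumPy and TensorFlow but imports random without ever seeding it, so Python-level shuffling could still vary between runs. A minimal sketch of seeding all three, assuming the same TF 1.x set_random_seed API imported above:

import random
import numpy as np
from tensorflow import set_random_seed  # TF 1.x-style API, matching the import in this file

RANDOM_SEED = 666
random.seed(RANDOM_SEED)      # Python's built-in RNG, imported above but left unseeded in the commit
np.random.seed(RANDOM_SEED)   # NumPy RNG, as in the commit
set_random_seed(RANDOM_SEED)  # TensorFlow graph-level seed, as in the commit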
@@ -33,7 +41,7 @@ glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file
# Load the GloVe vectors file into memory; 3 indexes are reserved (0: padding, 1: word not present in the embedding, 2: magic word)
number_features = 100000
number_features = 500000
begin_ofset = 3
glove.load(number_features, begin_ofset)
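The embedding_matrix passed to the Embedding layer further down is presumably built from this glove object while honouring the three reserved indexes described in the comment. A rough sketch under that assumption; glove.get_vector is a hypothetical accessor used only for illustration, since the real helper is not visible in this diff:

import numpy as np

feature_size = number_features + begin_ofset            # rows 0-2 stay reserved: padding, OOV, magic word
embedding_matrix = np.zeros((feature_size, EMBEDDING_DIM))
for word_index in range(begin_ofset, feature_size):
    # hypothetical accessor; the actual glove helper is not shown in this commit
    embedding_matrix[word_index] = glove.get_vector(word_index)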
@@ -83,13 +91,17 @@ def fit_transform_vocabulary(corpus):
    # corpus_indexes: index of each word of the tweet in the embedding model
    corpus_indexes = []
    corpus_lengths = []
    own_append_corpus_lengths = corpus_lengths.append
    own_lower = str.lower
    for doc in corpus:
        tweet_indexes = []
        tokens = tokenize(doc)
        own_append_corpus_lengths(len(tokens))
        for token in tokens:
            if (token != "#triggerword"):
                if (glove.is_word(token)):
                    word_index_embedding = glove.get_word_index(token)
                if (glove.is_word(own_lower(token))):
                    word_index_embedding = glove.get_word_index(own_lower(token))
                    tweet_indexes.append(word_index_embedding)
                else:
                    index = 1
@@ -101,6 +113,10 @@ def fit_transform_vocabulary(corpus):
        corpus_indexes.append(tweet_indexes)
    print(np.max(corpus_lengths))
    print(np.mean(corpus_lengths))
    print(stats.mode(corpus_lengths, axis=0))
    return corpus_indexes
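These max/mean/mode printouts are presumably what informed lowering max_len_input from 40 to 30 in the next hunk. A small alternative sketch, assuming corpus_lengths as computed above, that derives the cut-off from a percentile instead of eyeballing the statistics:

import numpy as np

# cover e.g. 95% of tweets without truncation; longer ones are cut by pad_sequences below
max_len_input = int(np.percentile(corpus_lengths, 95))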
@@ -111,7 +127,7 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
    corpus_train_index = fit_transform_vocabulary(tweets_train)
    corpus_dev_index = fit_transform_vocabulary(tweets_dev)
    max_len_input = 40
    max_len_input = 30
    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", value=0)
    padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", value=0)
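For reference, with padding="post" and truncating="post" Keras appends the fill value 0 to short index lists and drops the tail of over-long ones, so every row ends up exactly max_len_input wide. A toy example with made-up indexes:

from keras.preprocessing import sequence

toy = [[5, 8, 9], [4, 7, 11, 2, 6, 3]]
print(sequence.pad_sequences(toy, maxlen=4, padding="post", truncating="post", value=0))
# [[ 5  8  9  0]
#  [ 4  7 11  2]]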
@@ -140,10 +156,14 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
    e = Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
    model.add(e)
    # number of features: 32; each vector of 200 dim is converted to a vector of 32 dim
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    #model.add(Bidirectional(LSTM(2,dropout=0.2,recurrent_dropout=0.2,return_sequences=True)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(LSTM(128, return_sequences=True))
    #model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dense(64, activation='relu', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), activity_regularizer=regularizers.l2(0.0001)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(32, activation='relu', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), activity_regularizer=regularizers.l2(0.0001)))
    model.add(Dropout(0.5))
    model.add(Dense(len(CLASSES), activation='softmax'))
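The commit message mentions a BiLSTM and two Bidirectional calls are left commented out above; a minimal sketch of how that wrapper would slot into the same Sequential model (this is the commented-out variant, not what the commit actually enables):

from keras.layers import Bidirectional, LSTM

# Bidirectional runs the wrapped LSTM forwards and backwards and concatenates the
# two outputs, so each timestep grows to 2 * 128 = 256 features here.
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))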
@@ -154,7 +174,10 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
    # compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    print("Training the model...")
    model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=50, verbose=1, validation_data=(train_features_pad, tweets_train_labels_numeric))
    earlyStopping = EarlyStopping('loss', patience=5, mode='min')
    model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=50, verbose=1, validation_data=(train_features_pad, tweets_train_labels_numeric), callbacks=[earlyStopping])
    loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, batch_size=32, verbose=1)
    print('Accuracy training: %f' % (accuracy * 100))
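Both fit calls above reuse the training set as validation_data, and the callback monitors 'loss' rather than 'val_loss', so the held-out dev split never influences early stopping. A hedged variant that watches the dev data instead; the dev label array name tweets_dev_labels_numeric is a guess, since the function signature is truncated in the hunk header:

from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
model.fit(train_features_pad, tweets_train_labels_numeric,
          batch_size=32, epochs=50, verbose=1,
          validation_data=(padded_docs_dev, tweets_dev_labels_numeric),  # hypothetical dev label name
          callbacks=[early_stopping])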