Flor Miriam Plaza del Arco / WASSA 2018
Commit eb0c57c1, authored Jul 02, 2018 by geni
LSTM sequences true; regularization, initialization, random seed; bilstm
Parent e0d559d6
Showing 1 changed file with 33 additions and 10 deletions
embeddings_RNN.py
@@ -18,11 +18,19 @@ from keras.utils import np_utils
 from sklearn import metrics
 from nltk.tokenize.casual import TweetTokenizer
 from keras.preprocessing import sequence
+from keras.callbacks import EarlyStopping
+from keras.initializers import glorot_normal, glorot_uniform
+from keras import regularizers
 import random
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+from tensorflow import set_random_seed
+from scipy import stats
+RANDOM_SEED = 666
+np.random.seed(RANDOM_SEED)
+set_random_seed(RANDOM_SEED)
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
 CLASSES = []
 EMBEDDING_DIM = 200
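Note on the added seeding lines: below is a minimal standalone sketch of the same reproducibility pattern, not part of the commit. It assumes TensorFlow 1.x (where tensorflow.set_random_seed exists; TF 2.x renamed it to tf.random.set_seed) and additionally seeds Python's own random module, an extra step the diff does not include.

# Reproducibility sketch (assumption: TensorFlow 1.x API).
import os
import random
import numpy as np
from tensorflow import set_random_seed

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TensorFlow info/warning logs

RANDOM_SEED = 666
random.seed(RANDOM_SEED)       # Python RNG (extra step, not in the commit)
np.random.seed(RANDOM_SEED)    # NumPy RNG, which Keras also draws from
set_random_seed(RANDOM_SEED)   # TensorFlow graph-level seed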
@@ -33,7 +41,7 @@ glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
 glove.path_file = glove_file
 #Load the Glove vectors file into memory, 3 index reserved (0: paddind, 1: word not present in embedding, 2: magic word)
-number_features = 100000
+number_features = 500000
 begin_ofset = 3
 glove.load(number_features, begin_ofset)
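The glove helper and its load() method are project code not shown in this diff. Purely to illustrate the reserved-index convention named in the comment (0: padding, 1: word not in the embedding, 2: magic word, real vectors from index 3), a hypothetical loader might look like the sketch below; the function name, arguments, and return layout are mine, not the project's API.

# Hypothetical sketch of the reserved-index convention; load_glove is
# illustrative, not the project's actual glove helper.
import numpy as np

def load_glove(path, number_features, begin_ofset=3, dim=200):
    word_index = {}                              # word -> row in embedding matrix
    vectors = []
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= number_features:
                break
            parts = line.rstrip().split(' ')
            word_index[parts[0]] = i + begin_ofset
            vectors.append(np.asarray(parts[1:], dtype='float32'))
    matrix = np.zeros((len(vectors) + begin_ofset, dim), dtype='float32')
    matrix[begin_ofset:] = np.stack(vectors)     # rows 0-2 stay zero (reserved)
    return word_index, matrix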
@@ -83,13 +91,17 @@ def fit_transform_vocabulary(corpus):
     #corpus_indexes: index of each word of tweet in the embedding model
     corpus_indexes = []
+    corpus_lengths = []
+    own_append_corpus_lengths = corpus_lengths.append
+    own_lower = str.lower
     for doc in corpus:
         tweet_indexes = []
         tokens = tokenize(doc)
+        own_append_corpus_lengths(len(tokens))
         for token in tokens:
             if(token != "#triggerword"):
-                if(glove.is_word(token)):
-                    word_index_embedding = glove.get_word_index(token)
+                if(glove.is_word(own_lower(token))):
+                    word_index_embedding = glove.get_word_index(own_lower(token))
                     tweet_indexes.append(word_index_embedding)
                 else:
                     index = 1
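Two things happen in this hunk: tokens are lower-cased before the embedding lookup (the glove.twitter.27B vectors are lower-cased, so a mixed-case token would otherwise fall into the out-of-vocabulary branch), and the bound methods are aliased to local names, a common micro-optimisation for tight loops. A tiny illustration of the aliasing idiom with toy tokens, not corpus data:

corpus_lengths = []
own_append_corpus_lengths = corpus_lengths.append   # skips an attribute lookup per call
own_lower = str.lower

for token in ["Fear", "sadness", "JOY"]:             # toy tokens
    own_append_corpus_lengths(len(token))
    print(own_lower(token))                           # fear, sadness, joy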
@@ -101,6 +113,10 @@ def fit_transform_vocabulary(corpus):
         corpus_indexes.append(tweet_indexes)
+    print(np.max(corpus_lengths))
+    print(np.mean(corpus_lengths))
+    print(stats.mode(corpus_lengths, axis=0))
     return corpus_indexes
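The three added prints report the maximum, mean, and mode of the tweet lengths (in tokens); these statistics presumably inform the choice of max_len_input in the next hunk. With toy numbers in place of the real corpus:

import numpy as np
from scipy import stats

corpus_lengths = [12, 18, 30, 25, 30, 9]    # toy lengths, not the real corpus
print(np.max(corpus_lengths))               # 30
print(np.mean(corpus_lengths))              # 20.666...
print(stats.mode(corpus_lengths, axis=0))   # mode 30, count 2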
@@ -111,7 +127,7 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
     corpus_train_index = fit_transform_vocabulary(tweets_train)
     corpus_dev_index = fit_transform_vocabulary(tweets_dev)
-    max_len_input = 40
+    max_len_input = 30
     train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", value=0)
     padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", value=0)
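For reference, sequence.pad_sequences with these arguments pads short sequences at the end with the reserved index 0 and truncates long ones at the end. A self-contained example using toy index lists, not real tweets:

from keras.preprocessing import sequence

toy_index = [[5, 8, 13], list(range(3, 43))]   # lengths 3 and 40
padded = sequence.pad_sequences(toy_index, maxlen=30, padding="post",
                                truncating="post", value=0)
print(padded.shape)      # (2, 30)
print(padded[0][:5])     # [ 5  8 13  0  0]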
@@ -140,10 +156,14 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
     e = Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
     model.add(e)
     #number of features:_32 each vector of 200 dim is converted to a vector of 32 dim
-    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
+    #model.add(Bidirectional(LSTM(2,dropout=0.2,recurrent_dropout=0.2,return_sequences=True)))
+    model.add(LSTM(128, return_sequences=True))
-    model.add(Dense(32, activation='relu'))
+    #model.add(Bidirectional(LSTM(128, return_sequences=True)))
+    model.add(Dense(32, activation='relu'))
+    model.add(Dense(64, activation='relu', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), activity_regularizer=regularizers.l2(0.0001)))
+    model.add(Dropout(0.25))
+    model.add(Flatten())
+    model.add(Dense(32, activation='relu', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), activity_regularizer=regularizers.l2(0.0001)))
     model.add(Dropout(0.5))
     model.add(Dense(len(CLASSES), activation='softmax'))
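Read together, the new stack is: a frozen GloVe Embedding, an LSTM that now returns the full sequence of 128-dim states (return_sequences=True), two ReLU Dense layers applied per timestep with seeded Glorot initialisation and L2 activity regularisation, a Flatten before the classifier head, and dropout at two points. A sketch of how the whole model should read after this commit (feature_size, embedding_matrix, and CLASSES are defined elsewhere in the script; the build_model wrapper is mine):

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten
from keras.initializers import glorot_uniform
from keras import regularizers

def build_model(feature_size, embedding_matrix, max_len_input, n_classes,
                embedding_dim=200, seed=666):
    model = Sequential()
    model.add(Embedding(feature_size, embedding_dim, input_length=max_len_input,
                        weights=[embedding_matrix], trainable=False))
    model.add(LSTM(128, return_sequences=True))              # (max_len_input, 128)
    model.add(Dense(32, activation='relu'))                   # applied per timestep
    model.add(Dense(64, activation='relu',
                    kernel_initializer=glorot_uniform(seed=seed),
                    activity_regularizer=regularizers.l2(0.0001)))
    model.add(Dropout(0.25))
    model.add(Flatten())                                      # (max_len_input * 64,)
    model.add(Dense(32, activation='relu',
                    kernel_initializer=glorot_uniform(seed=seed),
                    activity_regularizer=regularizers.l2(0.0001)))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))
    return model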
@@ -154,7 +174,10 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, twee
     # compile the model
     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
     print("Training the model...")
-    model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=50, verbose=1, validation_data=(train_features_pad, tweets_train_labels_numeric))
+    earlyStopping = EarlyStopping('loss', patience=5, mode='min')
+    model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=50, verbose=1, validation_data=(train_features_pad, tweets_train_labels_numeric), callbacks=[earlyStopping])
     loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, batch_size=32, verbose=1)
     print('Accuracy trainning: %f' % (accuracy*100))
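The added callback stops training once the monitored quantity has failed to decrease for 5 consecutive epochs; the first positional argument of EarlyStopping is monitor, so EarlyStopping('loss', patience=5, mode='min') watches the training loss. Note that validation_data here is the training set itself, so the reported validation metrics mirror the training metrics. A minimal usage sketch, with placeholder x_train/y_train:

from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='loss', patience=5, mode='min')
# model.fit(x_train, y_train, batch_size=32, epochs=50, verbose=1,
#           callbacks=[early_stopping])   # x_train / y_train are placeholders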