Flor Miriam Plaza del Arco / WASSA 2018
Commit f80fe0a3 authored Jun 28, 2018 by Flor Miriam Plaza del Arco
Third version of the script
parent 82e1a3eb
Showing 2 changed files with 70 additions and 58 deletions
corpus/.~lock.train-v2.csv# (new file, 0 → 100644)

```diff
+,fmplaza,SINAI-155-1,27.06.2018 17:01,file:///home/fmplaza/.config/libreoffice/4;
\ No newline at end of file
```
embeddings_RNN.py
```diff
@@ -5,22 +5,22 @@ Created on 20 jun. 2018
 '''
 import os
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 from keras.datasets.imdb import get_word_index
 from model.glove_word_embedings import GloveWordEmbednigs
 import pandas as pd
 from nltk.tokenize.casual import TweetTokenizer
 import numpy as np
 from keras.preprocessing.text import Tokenizer
 from keras.preprocessing.sequence import pad_sequences
 from keras.models import Sequential
-from keras.layers import Dense
-from keras.layers import LSTM
-from keras.layers import Embedding
+from keras.layers import Dense, LSTM, Embedding, Bidirectional, Conv1D, GlobalAveragePooling1D, MaxPooling1D, Dropout, Activation, Flatten
 from mpl_toolkits.axes_grid1.axes_size import Padded
 from keras.utils import np_utils
 from sklearn import metrics
 from nltk.tokenize.casual import TweetTokenizer
 from keras.preprocessing import sequence
 import random
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
```
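For reference (not part of the commit), the `TweetTokenizer` configured above lowercases tokens, clips character elongations, and keeps @-handles and hashtags as single tokens:

```python
from nltk.tokenize.casual import TweetTokenizer

# Same settings as TWEET_TOKENIZER in the script: case folded, repeated characters
# clipped to three, @-handles kept as tokens.
tok = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
print(tok.tokenize("@WASSA2018 I LOVED this soooooo much #triggerword"))
# ['@wassa2018', 'i', 'loved', 'this', 'sooo', 'much', '#triggerword']
```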
```diff
@@ -31,8 +31,11 @@ EMBEDDING_DIM = 200
 glove = GloveWordEmbednigs()
 glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
 glove.path_file = glove_file
-#Load the Glove vectors file into memory, 2 index reserved
-glove.load(2)
+#Load the Glove vectors file into memory, 3 index reserved (0: paddind, 1: word not present in embedding, 2: magic word)
+number_features = 100000
+begin_ofset = 3
+glove.load(number_features, begin_ofset)
 #Load the WASSA corpus
```
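The `GloveWordEmbednigs` loader itself is not part of this diff; as a rough sketch of what the offset-based loading described in the new comment could look like (`load_glove` and its behaviour are an assumption for illustration, not the project's class):

```python
import numpy as np

def load_glove(path, max_words=100000, begin_offset=3, dim=200):
    """Hypothetical loader mirroring the comment above: rows 0..begin_offset-1 are
    reserved (0: padding, 1: word not in the embedding, 2: trigger word) and real
    vocabulary rows start at begin_offset."""
    word_indexes, vectors = {}, []
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            if len(word_indexes) >= max_words:
                break
            parts = line.rstrip().split(" ")
            word_indexes[parts[0]] = begin_offset + len(word_indexes)
            vectors.append(np.asarray(parts[1:], dtype="float32"))
    matrix = np.zeros((begin_offset + len(vectors), dim), dtype="float32")
    matrix[begin_offset:] = np.vstack(vectors)  # reserved rows stay zero for now
    return word_indexes, matrix
```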
```diff
@@ -73,82 +76,89 @@ def tokenize(text):
 def fit_transform_vocabulary(corpus):
     #generate vocabulary of corpus
     vocabulary = {}
+    #index 0: padding
+    #index 1: word not present in the embedding
+    #index 2: word magic (triggerword)
+    #corpus_indexes: index of each word of tweet in the embedding model
     corpus_indexes = []
     index = 1
     for doc in corpus:
-        doc_indexes = []
+        tweet_indexes = []
         tokens = tokenize(doc)
         for token in tokens:
-            if token not in vocabulary:
-                vocabulary[token] = index
-                doc_indexes.append(index)
-                index += 1
+            if (token != "#triggerword"):
+                if (glove.is_word(token)):
+                    word_index_embedding = glove.get_word_index(token)
+                    tweet_indexes.append(word_index_embedding)
+                else:
+                    index = 1
+                    tweet_indexes.append(index)
             else:
-                doc_indexes.append(vocabulary[token])
-        corpus_indexes.append(doc_indexes)
-    return (vocabulary, corpus_indexes)
+                index = 2
+                tweet_indexes.append(index)
+        corpus_indexes.append(tweet_indexes)
+    return corpus_indexes
```
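Read on its own, the rewritten function no longer builds a corpus vocabulary: every token is mapped straight to a row of the embedding table. A toy illustration of that index scheme (invented data, not from the repository):

```python
# Known words map to their GloVe row, unknown words map to the reserved index 1,
# and the literal "#triggerword" token maps to the reserved index 2.
toy_glove_indexes = {"i": 3, "love": 4, "this": 5}   # invented rows for the example

def to_indexes(tokens):
    out = []
    for token in tokens:
        if token == "#triggerword":
            out.append(2)                                # reserved trigger-word index
        else:
            out.append(toy_glove_indexes.get(token, 1))  # 1 = not in the embedding
    return out

print(to_indexes(["i", "love", "this", "#triggerword", "sooo"]))
# [3, 4, 5, 2, 1]
```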
```diff
 def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev):
     #Classification with RNN and embedings (pre-trained)
-    #calculate vocabulary
-    vocabulary_train, corpus_train_index = fit_transform_vocabulary(tweets_train)
-    vocabulary_size = len(vocabulary_train) + 1
-    max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index], 0))
-    #calculate index vocabulary in corpus dev
-    corpus_dev_index = []
-    own_corpus_dev_index_append = corpus_dev_index.append
-    for tweet_dev in tweets_dev:
-        tokens_dev = tokenize(tweet_dev)
-        own_corpus_dev_index_append([vocabulary_train.get(token_dev, 0) for token_dev in tokens_dev])
-    print(type(own_corpus_dev_index_append))
-    # load pre-trained word embeddings into an Embedding layer
-    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
-    for word, index in vocabulary_train.items():
-        word = word.lower()
-        if index > vocabulary_size - 1:
-            break
-        else:
-            embedding_vector = glove.word_indexes.get(word)
-            if embedding_vector is not None:
-                embedding_matrix[index] = embedding_vector
-    #max_len_input = 30
-    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_train_index[0][0]))
+    corpus_train_index = fit_transform_vocabulary(tweets_train)
+    corpus_dev_index = fit_transform_vocabulary(tweets_dev)
+    max_len_input = 40
+    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", value=0)
+    padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", value=0)
```
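The two `pad_sequences` calls keep the beginning of each tweet and fill with the reserved index 0 up to `max_len_input`; a minimal illustration of that behaviour:

```python
from keras.preprocessing import sequence

# padding="post"/truncating="post" keep the start of each sequence and append the
# padding value 0 up to maxlen, as in the calls above.
seqs = [[3, 4, 5, 2], [7, 8]]
print(sequence.pad_sequences(seqs, maxlen=3, padding="post", truncating="post", value=0))
# [[3 4 5]
#  [7 8 0]]
```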
```diff
     # define RNN model
     model = Sequential()
+    #assign special index
+    trigger_word_vector = 2 * 0.1 * np.random.rand(EMBEDDING_DIM) - 1
+    glove.set_embedding_vector(1, trigger_word_vector)
+    vector_word_not_present = 2 * 0.1 * np.random.rand(EMBEDDING_DIM) - 1
+    glove.set_embedding_vector(2, vector_word_not_present)
+    #number of features in embeddings model
+    feature_size = number_features + 3
+    embedding_matrix = np.zeros((feature_size, EMBEDDING_DIM))
+    for word, idx in glove.word_indexes.items():
+        embedding_vec = glove.get_word_embedding(word)
+        if embedding_vec is not None and embedding_vec.shape[0] == EMBEDDING_DIM:
+            embedding_matrix[idx] = np.asarray(embedding_vec)
```
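A miniature, purely illustrative version of the matrix being built here (toy indexes and vectors; the real script uses the loader's 100000-word table and 200-dimensional GloVe vectors):

```python
import numpy as np

# Rows 0-2 stay reserved (padding / unknown / trigger word) and each known word's
# vector is copied into the row given by its index.
EMBEDDING_DIM = 4                                    # 200 in the real script
word_indexes = {"happy": 3, "sad": 4}                # invented indexes for illustration
vectors = {"happy": np.ones(4), "sad": -np.ones(4)}

embedding_matrix = np.zeros((len(word_indexes) + 3, EMBEDDING_DIM))
for word, idx in word_indexes.items():
    embedding_matrix[idx] = vectors[word]
print(embedding_matrix.shape)                        # (5, 4): 3 reserved rows + 2 words
```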
```diff
     #input_length: Length of input sequences, when it is constant
-    e = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
+    e = Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
     model.add(e)
     #number of features:_32 each vector of 200 dim is converted to a vector of 32 dim
-    model.add(LSTM(32))
     #model.add(Dense(32, activation='tanh'))
+    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
+    #model.add(Bidirectional(LSTM(2,dropout=0.2,recurrent_dropout=0.2,return_sequences=True)))
+    model.add(Dense(32, activation='tanh'))
+    model.add(Dropout(0.5))
     model.add(Dense(len(CLASSES), activation='softmax'))
-    # compile the model
-    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
-    model.add(Activation('sigmoid'))
     # summarize the model
     print(model.summary())
     # fit the model
+    print("compilando modelo")
+    # compile the model
+    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
+    print("entrenando")
     model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=10, verbose=0)
```
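One caveat worth noting: `categorical_crossentropy` with the softmax output above expects one-hot label rows, so if `tweets_train_labels_numeric` holds integer class ids they would need converting first; the already-imported `np_utils` covers that, for example:

```python
from keras.utils import np_utils
import numpy as np

# Hypothetical label preparation: integer emotion ids -> one-hot rows, which is the
# shape categorical_crossentropy expects for a softmax output layer.
labels = np.array([0, 2, 1, 2])                      # e.g. ids into CLASSES
one_hot = np_utils.to_categorical(labels, num_classes=3)
print(one_hot.shape)                                 # (4, 3)
```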
```diff
     # evaluate the model
     loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, verbose=0)
     print('Accuracy: %f' % (accuracy * 100))
-    padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_dev_index[0][0]))
     #prediction
     tweets_dev_classified_labels = model.predict_classes(padded_docs_dev, batch_size=32, verbose=1)
     return tweets_dev_classified_labels
```
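For reference, `predict_classes` on a softmax model amounts to a row-wise argmax over the class scores, so the returned integers index into `CLASSES`:

```python
import numpy as np

# Toy softmax scores for 2 dev tweets over 3 hypothetical classes; the predicted
# class id per tweet is the column with the highest score.
scores = np.array([[0.1, 0.7, 0.2],
                   [0.6, 0.3, 0.1]])
print(scores.argmax(axis=-1))   # [1 0] -> positions in the CLASSES list
```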