Flor Miriam Plaza del Arco / WASSA 2018
Commit f80fe0a3, authored Jun 28, 2018 by Flor Miriam Plaza del Arco
Third version of the script
parent 82e1a3eb
Showing 2 changed files with 70 additions and 58 deletions
corpus/.~lock.train-v2.csv#
embeddings_RNN.py
corpus/.~lock.train-v2.csv# 0 → 100644 (new file)
+,fmplaza,SINAI-155-1,27.06.2018 17:01,file:///home/fmplaza/.config/libreoffice/4;
\ No newline at end of file
embeddings_RNN.py
@@ -5,22 +5,22 @@ Created on 20 jun. 2018
 '''
 import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-from keras.datasets.imdb import get_word_index
 from model.glove_word_embedings import GloveWordEmbednigs
 import pandas as pd
 from nltk.tokenize.casual import TweetTokenizer
 import numpy as np
-from keras.preprocessing.text import Tokenizer
 from keras.preprocessing.sequence import pad_sequences
 from keras.models import Sequential
-from keras.layers import Dense
-from keras.layers import LSTM
-from keras.layers import Embedding
+from keras.layers import Dense, LSTM, Embedding, Bidirectional, Conv1D, GlobalAveragePooling1D, MaxPooling1D, Dropout, Activation, Flatten
 from mpl_toolkits.axes_grid1.axes_size import Padded
 from keras.utils import np_utils
 from sklearn import metrics
 from nltk.tokenize.casual import TweetTokenizer
 from keras.preprocessing import sequence
+import random
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
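Note (not part of the commit): the TWEET_TOKENIZER configured at the end of this hunk controls how tweets are split before indexing. A minimal stand-alone sketch of what those three flags do:

# Illustrative only; shows the effect of the TweetTokenizer flags used above.
from nltk.tokenize.casual import TweetTokenizer

tok = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
# preserve_case=False lowercases every token,
# reduce_len=True caps runs of repeated characters at three ("SOOOOO" -> "sooo"),
# strip_handles=False keeps @-mentions as tokens.
print(tok.tokenize("@WASSA2018 This is SOOOOO exciting!!! #triggerword"))
# roughly: ['@wassa2018', 'this', 'is', 'sooo', 'exciting', '!', '!', '!', '#triggerword']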
@@ -31,8 +31,11 @@ EMBEDDING_DIM = 200
 glove = GloveWordEmbednigs()
 glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
 glove.path_file = glove_file
-#Load the Glove vectors file into memory, 2 index reserved
-glove.load(2)
+#Load the Glove vectors file into memory, 3 index reserved (0: paddind, 1: word not present in embedding, 2: magic word)
+number_features = 100000
+begin_ofset = 3
+glove.load(number_features, begin_ofset)
 #Load the WASSA corpus
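Note: model/glove_word_embedings.py is not included in this diff, so the internals of GloveWordEmbednigs.load(number_features, begin_ofset) are not visible here. The sketch below is only an assumption about the indexing scheme that call appears to set up (indices 0-2 reserved, GloVe rows mapped from index 3 upward); the name load_glove and its signature are illustrative, not the project's API.

# Hypothetical stand-in for the project's loader, for illustration only.
import numpy as np

def load_glove(path, number_features=100000, begin_offset=3, dim=200):
    # Reserve indices 0..begin_offset-1 (0: padding, 1: word not in the embedding,
    # 2: the "magic" trigger word) and map the first number_features GloVe rows
    # to indices begin_offset, begin_offset+1, ...
    word_indexes = {}
    vectors = np.zeros((number_features + begin_offset, dim))
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= number_features:
                break
            parts = line.rstrip().split(" ")
            word_indexes[parts[0]] = i + begin_offset
            vectors[i + begin_offset] = np.asarray(parts[1:], dtype="float32")
    return word_indexes, vectors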
@@ -73,82 +76,89 @@ def tokenize(text):
 def fit_transform_vocabulary(corpus):
     #generate vocabulary of corpus
-    vocabulary = {}
+    #index 0: padding
+    #index 1: word not present in the embedding
+    #index 2: word magic (triggerword)
+    #corpus_indexes: index of each word of tweet in the embedding model
     corpus_indexes = []
-    index = 1
     for doc in corpus:
-        doc_indexes = []
+        tweet_indexes = []
         tokens = tokenize(doc)
         for token in tokens:
-            if token not in vocabulary:
-                vocabulary[token] = index
-                doc_indexes.append(index)
-                index += 1
-            else:
-                doc_indexes.append(vocabulary[token])
-        corpus_indexes.append(doc_indexes)
-    return (vocabulary, corpus_indexes)
+            if (token != "#triggerword"):
+                if (glove.is_word(token)):
+                    word_index_embedding = glove.get_word_index(token)
+                    tweet_indexes.append(word_index_embedding)
+                else:
+                    index = 1
+                    tweet_indexes.append(index)
+            else:
+                index = 2
+                tweet_indexes.append(index)
+        corpus_indexes.append(tweet_indexes)
+    return corpus_indexes
 
 def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev):
     #Classification with RNN and embedings (pre-trained)
     #calculate vocabulary
-    vocabulary_train, corpus_train_index = fit_transform_vocabulary(tweets_train)
-    vocabulary_size = len(vocabulary_train) + 1
-    max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index], 0))
-    #calculate index vocabulary in corpus dev
-    corpus_dev_index = []
-    own_corpus_dev_index_append = corpus_dev_index.append
-    for tweet_dev in tweets_dev:
-        tokens_dev = tokenize(tweet_dev)
-        own_corpus_dev_index_append([vocabulary_train.get(token_dev, 0) for token_dev in tokens_dev])
-    print(type(own_corpus_dev_index_append))
-    # load pre-trained word embeddings into an Embedding layer
-    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
-    for word, index in vocabulary_train.items():
-        word = word.lower()
-        if index > vocabulary_size - 1:
-            break
-        else:
-            embedding_vector = glove.word_indexes.get(word)
-            if embedding_vector is not None:
-                embedding_matrix[index] = embedding_vector
-    #max_len_input = 30
-    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_train_index[0][0]))
+    corpus_train_index = fit_transform_vocabulary(tweets_train)
+    corpus_dev_index = fit_transform_vocabulary(tweets_dev)
+    max_len_input = 40
+    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", value=0)
+    padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", value=0)
     # define RNN model
     model = Sequential()
+    #assign special index
+    trigger_word_vector = 2 * 0.1 * np.random.rand(EMBEDDING_DIM) - 1
+    glove.set_embedding_vector(1, trigger_word_vector)
+    vector_word_not_present = 2 * 0.1 * np.random.rand(EMBEDDING_DIM) - 1
+    glove.set_embedding_vector(2, vector_word_not_present)
+    #number of features in embeddings model
+    feature_size = number_features + 3
+    embedding_matrix = np.zeros((feature_size, EMBEDDING_DIM))
+    for word, idx in glove.word_indexes.items():
+        embedding_vec = glove.get_word_embedding(word)
+        if embedding_vec is not None and embedding_vec.shape[0] == EMBEDDING_DIM:
+            embedding_matrix[idx] = np.asarray(embedding_vec)
     #input_length: Length of input sequences, when it is constant
-    e = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
+    e = Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
     model.add(e)
     #number of features:_32 each vector of 200 dim is converted to a vector of 32 dim
-    model.add(LSTM(32))
+    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
+    #model.add(Bidirectional(LSTM(2,dropout=0.2,recurrent_dropout=0.2,return_sequences=True)))
+    #model.add(Dense(32, activation='tanh'))
+    model.add(Dense(32, activation='tanh'))
+    model.add(Dropout(0.5))
     model.add(Dense(len(CLASSES), activation='softmax'))
-    model.add(Activation('sigmoid'))
-    # compile the model
-    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
     # summarize the model
     print(model.summary())
-    # fit the model
+    print("compilando modelo")
+    # compile the model
+    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
+    print("entrenando")
     model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=10, verbose=0)
+    # evaluate the model
     loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, verbose=0)
     print('Accuracy: %f' % (accuracy * 100))
-    padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_dev_index[0][0]))
+    #prediction
     tweets_dev_classified_labels = model.predict_classes(padded_docs_dev, batch_size=32, verbose=1)
     return tweets_dev_classified_labels
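Note: to make the index convention of the rewritten fit_transform_vocabulary concrete, here is a tiny self-contained sketch. The plain dictionary stands in for the project's glove object (an assumption for illustration, not the real loader); the indices follow the comments in the diff: 0 padding, 1 word not present in the embedding, 2 the masked #triggerword token.

# Stand-in for glove.word_indexes: word -> embedding-matrix row, rows 0-2 reserved.
word_indexes = {"i": 3, "feel": 4, "happy": 5}

def to_indexes(tokens):
    tweet_indexes = []
    for token in tokens:
        if token != "#triggerword":
            # 1 = word not present in the embedding vocabulary
            tweet_indexes.append(word_indexes.get(token, 1))
        else:
            # 2 = the masked trigger word
            tweet_indexes.append(2)
    return tweet_indexes

print(to_indexes(["i", "feel", "so", "happy", "#triggerword"]))
# [3, 4, 1, 5, 2]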
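Note: the network assembled in the new version can be exercised end-to-end with dummy data. The sketch below keeps the layer sizes and padding settings from the diff but substitutes random inputs for the WASSA corpus and a random matrix for the GloVe weights; it assumes the Keras 2-era API the script itself uses (weights=[...] and input_length= on Embedding).

# Minimal runnable sketch with dummy data; only the layer sizes come from the diff.
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing import sequence

EMBEDDING_DIM = 200
feature_size = 1000          # stand-in for number_features + 3
max_len_input = 40
n_classes = 6                # stand-in for len(CLASSES)

# Random matrix in place of the GloVe vectors; rows 0-2 would be the reserved indices.
embedding_matrix = np.random.rand(feature_size, EMBEDDING_DIM)
x = [[np.random.randint(3, feature_size) for _ in range(np.random.randint(5, 30))] for _ in range(64)]
x_pad = sequence.pad_sequences(x, maxlen=max_len_input, padding="post", truncating="post", value=0)
y = np.eye(n_classes)[np.random.randint(0, n_classes, size=64)]   # one-hot labels

model = Sequential()
model.add(Embedding(feature_size, EMBEDDING_DIM, input_length=max_len_input,
                    weights=[embedding_matrix], trainable=False))  # frozen pre-trained vectors
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(32, activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(n_classes, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.fit(x_pad, y, batch_size=32, epochs=1, verbose=0)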