Flor Miriam Plaza del Arco / WASSA 2018
Commit 7373f811, authored Jun 21, 2018 by Flor Miriam Plaza del Arco
Second approach to the RNN program
parent dbb12602
Showing 3 changed files with 163 additions and 28 deletions
corpus/.~lock.train-v2.csv#
embeddings_RNN.py
position_trigger_word.py
corpus/.~lock.train-v2.csv#  deleted  100644 → 0
,fmplaza,SINAI-155-1,18.06.2018 19:22,file:///home/fmplaza/.config/libreoffice/4;
\ No newline at end of file
embeddings_RNN.py
...
...
@@ -4,6 +4,8 @@ Created on 20 jun. 2018
@author: fmplaza
'''

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from model.glove_word_embedings import GloveWordEmbednigs
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
...
...
@@ -15,88 +17,163 @@ from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from mpl_toolkits.axes_grid1.axes_size import Padded
from keras.utils import np_utils
from sklearn import metrics
from nltk.tokenize.casual import TweetTokenizer
from keras.preprocessing import sequence

TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
CLASSES = []
EMBEDDING_DIM = 200

# load GloVe model
glove = GloveWordEmbednigs()
glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file

# load the Glove vectors file into memory, 2 indexes reserved
glove.load(2)
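The project-local module model.glove_word_embedings is not part of this diff, so its interface can only be inferred from how the script uses it: a path_file attribute, a load(offset) method that keeps the first offset indices reserved, and a word_indexes dict mapping each word to its vector. A minimal sketch under those assumptions (not the actual SINAI implementation) could look like this:

# Assumed sketch only: a stand-in for model.glove_word_embedings.GloveWordEmbednigs,
# reconstructed from how the script above uses it.
import numpy as np

class GloveWordEmbednigsSketch:
    def __init__(self):
        self.path_file = None    # path to the GloVe .txt file, set by the caller
        self.word_indexes = {}   # word -> numpy vector

    def load(self, offset):
        # Read "word v1 v2 ... vN" lines; the first `offset` row indices are assumed
        # to stay reserved for special tokens (padding / unknown words).
        with open(self.path_file, encoding="utf-8") as embeddings_file:
            for line in embeddings_file:
                parts = line.rstrip().split(" ")
                self.word_indexes[parts[0]] = np.asarray(parts[1:], dtype="float32")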
# load the WASSA corpus
def read_corpus():
    labels = []
    classes_append = CLASSES.append
    tweets_train_labels_numeric = []
    tweets_dev_labels_numeric = []

    tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0)
    tweets_train_labels = tweets_train['emotion']

    tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0)
    tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0)
    tweets_dev_labels = tweets_dev_labels['emotion']

    # convert categorical labels into numeric labels
    for label in tweets_train_labels.tolist():
        if label not in CLASSES:
            classes_append(label)
        tweets_train_labels_numeric.append(CLASSES.index(label))

    for label in tweets_dev_labels.tolist():
        tweets_dev_labels_numeric.append(CLASSES.index(label))

    tweets_train_labels_numeric = np_utils.to_categorical(tweets_train_labels_numeric)

    return tweets_train.tweet, tweets_train_labels_numeric, tweets_dev.tweet, tweets_dev_labels_numeric


def tokenize(text):
    # preprocessing data
    text_tokenized = TWEET_TOKENIZER.tokenize(text)
    return text_tokenized
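As a toy illustration of the label encoding used in read_corpus (the emotion names below are placeholders, not values taken from the corpus), encounter-order indexing plus to_categorical behaves like this:

# Hypothetical labels, only to illustrate the encoding above.
from keras.utils import np_utils

CLASSES_DEMO = []
demo_labels = ["joy", "fear", "joy", "anger"]
demo_numeric = []
for demo_label in demo_labels:
    if demo_label not in CLASSES_DEMO:
        CLASSES_DEMO.append(demo_label)
    demo_numeric.append(CLASSES_DEMO.index(demo_label))
print(demo_numeric)                           # [0, 1, 0, 2]
print(np_utils.to_categorical(demo_numeric))  # one-hot rows, one column per class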
def fit_transform_vocabulary(corpus):
    # generate the vocabulary of the corpus
    # load the whole embedding into memory, 2 indexes reserved
    glove.load(2)
    vocabulary = {}
    corpus_indexes = []
    index = 1
    for doc in corpus:
        doc_indexes = []
        tokens = tokenize(doc)
        for token in tokens:
            if token not in vocabulary:
                vocabulary[token] = index
                doc_indexes.append(index)
                index += 1
            else:
                doc_indexes.append(vocabulary[token])
        corpus_indexes.append(doc_indexes)
    return (vocabulary, corpus_indexes)
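For reference, on a tiny invented corpus the function returns a 1-based vocabulary plus one index sequence per document (this example assumes the GloVe file configured above is in place, since the function reloads it):

# Invented mini-corpus, only to show the shape of fit_transform_vocabulary's output.
demo_vocabulary, demo_corpus_indexes = fit_transform_vocabulary([
    "happy happy day",
    "sad day",
])
print(demo_vocabulary)      # e.g. {'happy': 1, 'day': 2, 'sad': 3}
print(demo_corpus_indexes)  # e.g. [[1, 1, 2], [3, 2]]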
# classification with RNN and embeddings (pre-trained)
def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev):
    # calculate vocabulary
    vocabulary_train, corpus_train_index = fit_transform_vocabulary(tweets_train)
    vocabulary_size = len(vocabulary_train) + 1
    max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index], 0))

    # calculate index vocabulary in corpus dev
    corpus_dev_index = []
    own_corpus_dev_index_append = corpus_dev_index.append
    for tweet_dev in tweets_dev:
        tokens_dev = tokenize(tweet_dev)
        own_corpus_dev_index_append([vocabulary_train.get(token_dev, 0) for token_dev in tokens_dev])

    print(type(own_corpus_dev_index_append))
    # load pre-trained word embeddings into an Embedding layer
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, index in vocabulary_train.items():
        word = word.lower()
        if index > vocabulary_size - 1:
            break
        else:
            embedding_vector = glove.word_indexes.get(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector
    # max_len_input = 30
    max_len_input = 20

    # pad documents to a max length of n words
    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post",
                                                truncating="post", dtype=type(corpus_train_index[0][0]))
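As a quick illustration of the post-padding and post-truncating used above, here is its effect on a made-up batch of index sequences (lengths chosen only for the example):

# Made-up index sequences, only to show the padding behaviour.
from keras.preprocessing import sequence

demo_batch = [[5, 3, 8], [2], [7, 1, 4, 9, 6]]
print(sequence.pad_sequences(demo_batch, maxlen=4, padding="post", truncating="post"))
# [[5 3 8 0]
#  [2 0 0 0]
#  [7 1 4 9]]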
    # define RNN model
    model = Sequential()
    # input_length: length of the input sequences, when it is constant
    e = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix],
                  trainable=False)
    model.add(e)
    # number of features: 32, each vector of 200 dim is converted to a vector of 32 dim
    model.add(LSTM(32))
    # model.add(Dense(32, activation='tanh'))
    model.add(Dense(len(CLASSES), activation='softmax'))
    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    # summarize the model
    print(model.summary())

    # fit the model
    print(len(train_features_pad))
    print(len(tweets_train_labels_numeric))
    model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=10, verbose=0)

    # evaluate the model
    loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, verbose=0)
    print('Accuracy: %f' % (accuracy * 100))

    padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post",
                                             truncating="post", dtype=type(corpus_dev_index[0][0]))
    tweets_dev_classified_labels = model.predict_classes(padded_docs_dev, batch_size=32, verbose=1)

    return tweets_dev_classified_labels
def calculate_quality_performamnce(y_labels, y_classified_labels, model_name):
    classes_index = [CLASSES.index(c) for c in CLASSES]
    accruacy = metrics.accuracy_score(y_labels, y_classified_labels)
    macro_precision = metrics.precision_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    macro_recall = metrics.recall_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    macro_f1 = metrics.f1_score(y_labels, y_classified_labels, labels=classes_index, average="macro")

    print("\n*** Results " + model_name + " ***")
    print("Macro-Precision: " + str(macro_precision))
    print("Macro-Recall: " + str(macro_recall))
    print("Macro-F1: " + str(macro_f1))
    print("Accuracy: " + str(accruacy))
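For intuition, macro-averaged scores like the ones computed above are unweighted means of the per-class scores; a toy three-class example with made-up label vectors:

# Made-up label vectors (class indices 0-2), only to illustrate macro averaging.
from sklearn import metrics

demo_y_true = [0, 0, 1, 2, 2]
demo_y_pred = [0, 1, 1, 2, 0]
print(metrics.precision_score(demo_y_true, demo_y_pred, labels=[0, 1, 2], average="macro"))  # ~0.667
print(metrics.f1_score(demo_y_true, demo_y_pred, labels=[0, 1, 2], average="macro"))         # ~0.611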
def main():
    tweets_train, tweets_train_labels_numeric, tweets_dev, tweets_dev_labels = read_corpus()
    tweets_dev_classified_labels = classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev)
    calculate_quality_performamnce(tweets_dev_labels, tweets_dev_classified_labels, "RNN_LSTM")


if __name__ == '__main__':
...
...
position_trigger_word.py  0 → 100644
'''
Created on 20 jun. 2018

@author: fmplaza
'''

import pandas as pd
from nltk.tokenize import TweetTokenizer
import statistics as s

tknzr = TweetTokenizer()
def read_corpus():
    labels = []
    tweets_train_labels_numeric = []

    tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0)
    tweets_train_labels = tweets_train['emotion']

    tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0)
    tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0)

    return tweets_train, tweets_train_labels, tweets_dev, tweets_dev_labels
def calculate_position(tweets_train):
    position_triggerword = []
    len_tweet = []
    cont = 0

    for index, row in tweets_train.iterrows():
        text = tknzr.tokenize(row['tweet'])
        if '#TRIGGERWORD' in text:
            position = text.index('#TRIGGERWORD')
            position_triggerword.append(position)
        len_tweet.append(len(text))
        if len(text) == 155:
            cont = cont + 1

    print("Position trigger word")
    print("Max position: ", max(position_triggerword))
    print("Mean position", s.mean(position_triggerword))
    print("Mode position", s.mode(position_triggerword))

    print("Length tweet")
    print("Mean length", s.mean(len_tweet))
    print("Mode length", s.mode(len_tweet))
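A quick, made-up illustration of what the loop above measures; the tweet text is invented, and it assumes the corpus placeholder is tokenized as a single '#TRIGGERWORD' item, as the script checks for:

# Hypothetical tweet, only to show how the trigger-word position and length are derived.
from nltk.tokenize import TweetTokenizer

demo_tokens = TweetTokenizer().tokenize("I could not stop feeling #TRIGGERWORD when the results came in")
print(demo_tokens.index('#TRIGGERWORD'))  # 0-based position of the placeholder, here 5
print(len(demo_tokens))                   # tweet length in tokens, here 11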
def main():
    tweets_train, tweets_train_labels, tweets_dev, tweets_dev_labels = read_corpus()
    calculate_position(tweets_train)


if __name__ == '__main__':
    main()
    pass
\ No newline at end of file