Second approach to the RNN program

parent dbb12602
@@ -4,6 +4,8 @@ Created on 20 jun. 2018
@author: fmplaza
'''
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
from model.glove_word_embedings import GloveWordEmbednigs
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
@@ -15,88 +17,163 @@ from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.models import Sequential
from keras.utils import np_utils
from sklearn import metrics
from keras.preprocessing import sequence
import numpy as np
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
CLASSES = []
EMBEDDING_DIM = 200
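#EMBEDDING_DIM = 200 matches the dimensionality of the GloVe Twitter vectors loaded below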
#load GloVe model
glove = GloveWordEmbednigs()
glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file
#Load the Glove vectors file into memory, 2 index reserved
glove.load(2)
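#NOTE: the two reserved indexes are assumed to cover padding and out-of-vocabulary tokens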
#Load the WASSA corpus
def read_corpus():
    tweets_train_labels_numeric = []
    tweets_dev_labels_numeric = []

    tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0)
    tweets_train_labels = tweets_train['emotion']

    tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0)
    tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0)
    tweets_dev_labels = tweets_dev_labels['emotion']

    #convert categorical labels into numeric labels
    for label in tweets_train_labels.tolist():
        if label not in CLASSES:
            CLASSES.append(label)
        tweets_train_labels_numeric.append(CLASSES.index(label))

    for label in tweets_dev_labels.tolist():
        tweets_dev_labels_numeric.append(CLASSES.index(label))

    #one-hot encode the training labels for the softmax output layer
    tweets_train_labels_numeric = np_utils.to_categorical(tweets_train_labels_numeric)

    return tweets_train.tweet, tweets_train_labels_numeric, tweets_dev.tweet, tweets_dev_labels_numeric
def tokenize(text):
    #preprocess the text with the Twitter-aware tokenizer
    return TWEET_TOKENIZER.tokenize(text)
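#e.g. tokenize("@user I loooooove this") -> ['@user', 'i', 'looove', 'this']
#(preserve_case=False lower-cases the text, reduce_len=True caps character runs at three)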
def fit_transform_vocabulary(corpus):
    #generate the vocabulary of the corpus and map each document to word indexes
    #index 0 is left free so it can act as the padding/unknown-word index
    vocabulary = {}
    corpus_indexes = []
    index = 1

    for doc in corpus:
        doc_indexes = []
        tokens = tokenize(doc)
        for token in tokens:
            if token not in vocabulary:
                vocabulary[token] = index
                doc_indexes.append(index)
                index += 1
            else:
                doc_indexes.append(vocabulary[token])
        corpus_indexes.append(doc_indexes)

    return (vocabulary, corpus_indexes)
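#e.g. fit_transform_vocabulary(["good day", "good luck"])
#returns ({'good': 1, 'day': 2, 'luck': 3}, [[1, 2], [1, 3]])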
def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev):
    #Classification with RNN and embeddings (pre-trained)

    #calculate the vocabulary and index the training corpus
    vocabulary_train, corpus_train_index = fit_transform_vocabulary(tweets_train)
    vocabulary_size = len(vocabulary_train) + 1
    #use the average training tweet length as the input sequence length
    max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index]))

    #index the dev corpus with the training vocabulary
    corpus_dev_index = []
    for tweet_dev in tweets_dev:
        tokens_dev = tokenize(tweet_dev)
        corpus_dev_index.append([vocabulary_train.get(token_dev, 0) for token_dev in tokens_dev])
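    #dev tokens never seen in training fall back to index 0,
    #whose row in the embedding matrix below stays all zeros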
    #load the pre-trained word embeddings into an Embedding layer
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, index in vocabulary_train.items():
        embedding_vector = glove.word_indexes.get(word.lower())
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    #pad (and truncate) the documents to max_len_input words
    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post")
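    #e.g. sequence.pad_sequences([[5, 3]], maxlen=4, padding="post") -> [[5, 3, 0, 0]]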
    #define the RNN model
    model = Sequential()
    #input_length: length of the input sequences, when it is constant
    e = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
    model.add(e)
    #the sequence of 200-dim word vectors is encoded into a single 32-dim vector
    model.add(LSTM(32))
    model.add(Dense(len(CLASSES), activation='softmax'))
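    #shape flow: (batch, max_len_input) -> Embedding -> (batch, max_len_input, 200)
    #            -> LSTM -> (batch, 32) -> Dense -> (batch, len(CLASSES))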
    #compile the model; categorical_crossentropy matches the one-hot labels and the softmax output
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    #summarize the model
    print(model.summary())
    #fit the model
    model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=10, verbose=0)
    #evaluate the model (on the training data itself)
    loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, verbose=0)
    print('Accuracy: %f' % (accuracy*100))
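    #predict_classes (Sequential API of Keras 2) returns the argmax class index per tweet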
    padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post")
    tweets_dev_classified_labels = model.predict_classes(padded_docs_dev, batch_size=32, verbose=1)

    return tweets_dev_classified_labels
def calculate_quality_performance(y_labels, y_classified_labels, model_name):
    classes_index = [CLASSES.index(c) for c in CLASSES]
    accuracy = metrics.accuracy_score(y_labels, y_classified_labels)
    macro_precision = metrics.precision_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    macro_recall = metrics.recall_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    macro_f1 = metrics.f1_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    print("\n*** Results " + model_name + " ***")
    print("Macro-Precision: " + str(macro_precision))
    print("Macro-Recall: " + str(macro_recall))
    print("Macro-F1: " + str(macro_f1))
    print("Accuracy: " + str(accuracy))
def main():
    tweets_train, tweets_train_labels_numeric, tweets_dev, tweets_dev_labels = read_corpus()
    tweets_dev_classified_labels = classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev)
    calculate_quality_performance(tweets_dev_labels, tweets_dev_classified_labels, "RNN_LSTM")

if __name__ == '__main__':
    main()
'''
Created on 20 jun. 2018
@author: fmplaza
'''
import pandas as pd
from nltk.tokenize import TweetTokenizer
import statistics as s
tknzr = TweetTokenizer()
def read_corpus():
    tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0)
    tweets_train_labels = tweets_train['emotion']
    tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0)
    tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0)
    return tweets_train, tweets_train_labels, tweets_dev, tweets_dev_labels
def calculate_position(tweets_train):
    #locate the #TRIGGERWORD placeholder in the training tweets
    position_triggerword = []
    len_tweet = []
    cont = 0

    for index, row in tweets_train.iterrows():
        text = tknzr.tokenize(row['tweet'])
        if '#TRIGGERWORD' in text:
            position = text.index('#TRIGGERWORD')
            position_triggerword.append(position)
        len_tweet.append(len(text))
        if len(text) == 155:
            cont = cont + 1

    print("Position trigger word")
    print("Max position: ", max(position_triggerword))
    print("Mean position: ", s.mean(position_triggerword))
    print("Mode position: ", s.mode(position_triggerword))
    print("Tweet length")
    print("Mean length: ", s.mean(len_tweet))
    print("Mode length: ", s.mode(len_tweet))
    print("Tweets of exactly 155 tokens: ", cont)
def main():
    tweets_train, tweets_train_labels, tweets_dev, tweets_dev_labels = read_corpus()
    calculate_position(tweets_train)

if __name__ == '__main__':
    main()