Segunda aproximación programa RNN

parent dbb12602
,fmplaza,SINAI-155-1,18.06.2018 19:22,file:///home/fmplaza/.config/libreoffice/4;
\ No newline at end of file
...@@ -4,6 +4,8 @@ Created on 20 jun. 2018 ...@@ -4,6 +4,8 @@ Created on 20 jun. 2018
@author: fmplaza @author: fmplaza
''' '''
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
from model.glove_word_embedings import GloveWordEmbednigs from model.glove_word_embedings import GloveWordEmbednigs
import pandas as pd import pandas as pd
from nltk.tokenize.casual import TweetTokenizer from nltk.tokenize.casual import TweetTokenizer
...@@ -15,50 +17,102 @@ from keras.layers import Dense ...@@ -15,50 +17,102 @@ from keras.layers import Dense
from keras.layers import LSTM from keras.layers import LSTM
from keras.layers import Embedding from keras.layers import Embedding
from mpl_toolkits.axes_grid1.axes_size import Padded from mpl_toolkits.axes_grid1.axes_size import Padded
from keras.utils import np_utils
from sklearn import metrics
from nltk.tokenize.casual import TweetTokenizer
from keras.preprocessing import sequence
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False) TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
CLASSES = [] CLASSES = []
EMBEDDING_DIM = 200 EMBEDDING_DIM = 200
#load GloVe model
glove = GloveWordEmbednigs() glove = GloveWordEmbednigs()
glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt" glove_file = "./embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
glove.path_file = glove_file glove.path_file = glove_file
#Load the Glove vectors file into memory, 2 index reserved
glove.load(2)
#Load the WASSA corpus #Load the WASSA corpus
def read_corpus(): def read_corpus():
labels = []
classes_append = CLASSES.append classes_append = CLASSES.append
tweets_train_labels_numeric = [] tweets_train_labels_numeric = []
tweets_dev_labels_numeric = []
tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0) tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0)
tweets_train_labels = tweets_train['emotion'] tweets_train_labels = tweets_train['emotion']
tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0) tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0)
tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0) tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0)
tweets_dev_labels = tweets_dev_labels['emotion']
#convert categorical labels into numeric labels
for label in tweets_train_labels.tolist(): for label in tweets_train_labels.tolist():
if(label not in CLASSES): if(label not in CLASSES):
classes_append(label) classes_append(label)
tweets_train_labels_numeric.append(CLASSES.index(label)) tweets_train_labels_numeric.append(CLASSES.index(label))
return tweets_train.tweet, tweets_train_labels_numeric, tweets_dev.tweet, tweets_dev_labels for label in tweets_dev_labels.tolist():
tweets_dev_labels_numeric.append(CLASSES.index(label))
tweets_train_labels_numeric = np_utils.to_categorical(tweets_train_labels_numeric)
return tweets_train.tweet, tweets_train_labels_numeric, tweets_dev.tweet, tweets_dev_labels_numeric
def tokenize(text):
#preprocessing data
#Classification with RNN and embedings (pre-trained) text_tokenized = TWEET_TOKENIZER.tokenize(text)
def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric):
#load the whole embedding into memory, 2 index reserved return text_tokenized
glove.load(2)
#preprocessing tweets train
tokenizer = Tokenizer() def fit_transform_vocabulary(corpus):
tokenizer.fit_on_texts(tweets_train) #generate vocabulary of corpus
vocabulary = {}
corpus_indexes = []
index = 1
for doc in corpus:
doc_indexes = []
tokens = tokenize(doc)
for token in tokens:
if token not in vocabulary:
vocabulary[token] = index
doc_indexes.append(index)
index += 1
else:
doc_indexes.append(vocabulary[token])
corpus_indexes.append(doc_indexes)
return (vocabulary, corpus_indexes)
def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev):
#Classification with RNN and embedings (pre-trained)
#calculate vocabulary #calculate vocabulary
vocabulary_size = len(tokenizer.word_index) + 1 vocabulary_train, corpus_train_index = fit_transform_vocabulary(tweets_train)
vocabulary_size = len(vocabulary_train) + 1
max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index], 0))
#calculate index vocabulary in corpus dev
corpus_dev_index = []
own_corpus_dev_index_append = corpus_dev_index.append
for tweet_dev in tweets_dev:
tokens_dev = tokenize(tweet_dev)
own_corpus_dev_index_append([vocabulary_train.get(token_dev, 0) for token_dev in tokens_dev])
print(type(own_corpus_dev_index_append))
# load pre-trained word embeddings into an Embedding layer # load pre-trained word embeddings into an Embedding layer
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM)) embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, index in tokenizer.word_index.items(): for word, index in vocabulary_train.items():
word = word.lower()
if index > vocabulary_size - 1: if index > vocabulary_size - 1:
break break
else: else:
...@@ -66,37 +120,60 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric): ...@@ -66,37 +120,60 @@ def classification_embedings_rnn(tweets_train, tweets_train_labels_numeric):
if embedding_vector is not None: if embedding_vector is not None:
embedding_matrix[index] = embedding_vector embedding_matrix[index] = embedding_vector
max_len_input = 20 #max_len_input = 30
#integer encode the documents train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_train_index[0][0]))
encoded_docs = tokenizer.texts_to_sequences(tweets_train)
#pad documents to a max length of n words
padded_docs = pad_sequences(encoded_docs, maxlen=max_len_input, padding='post')
# define RNN model # define RNN model
model = Sequential() model = Sequential()
e = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_len_input, trainable=False)
#input_length: Length of input sequences, when it is constant
e = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_len_input, weights=[embedding_matrix], trainable=False)
model.add(e) model.add(e)
#number of features:_32 each vector of 200 dim is converted to a vector of 32 dim
model.add(LSTM(32)) model.add(LSTM(32))
#model.add(Dense(32, activation='tanh')) #model.add(Dense(32, activation='tanh'))
model.add(Dense(1, activation='softmax')) model.add(Dense(len(CLASSES), activation='softmax'))
# compile the model # compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model # summarize the model
print(model.summary()) print(model.summary())
# fit the model # fit the model
print(len(padded_docs)) model.fit(train_features_pad, tweets_train_labels_numeric, batch_size=32, epochs=10, verbose=0)
print(len(tweets_train_labels_numeric))
model.fit(padded_docs, tweets_train_labels_numeric, epochs=50, verbose=0)
# evaluate the model # evaluate the model
loss, accuracy = model.evaluate(padded_docs, tweets_train_labels_numeric, verbose=0) loss, accuracy = model.evaluate(train_features_pad, tweets_train_labels_numeric, verbose=0)
print('Accuracy: %f' % (accuracy*100)) print('Accuracy: %f' % (accuracy*100))
padded_docs_dev = sequence.pad_sequences(corpus_dev_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_dev_index[0][0]))
tweets_dev_classified_labels = model.predict_classes(padded_docs_dev, batch_size=32, verbose=1)
return tweets_dev_classified_labels
def calculate_quality_performamnce(y_labels, y_classified_labels, model_name):
    '''Print macro precision, recall, F1 and accuracy for a set of predictions.

    y_labels: gold labels.
    y_classified_labels: labels predicted by the model.
    model_name: text inserted into the results banner.

    The macro-averaged scores are computed over the positions of the
    entries of the module-level CLASSES list.
    '''
    label_ids = list(map(CLASSES.index, CLASSES))
    macro_options = {"labels": label_ids, "average": "macro"}

    accuracy = metrics.accuracy_score(y_labels, y_classified_labels)
    precision = metrics.precision_score(y_labels, y_classified_labels, **macro_options)
    recall = metrics.recall_score(y_labels, y_classified_labels, **macro_options)
    f1 = metrics.f1_score(y_labels, y_classified_labels, **macro_options)

    print("\n*** Results " + model_name + " ***")
    report = (
        ("Macro-Precision: ", precision),
        ("Macro-Recall: ", recall),
        ("Macro-F1: ", f1),
        ("Accuracy: ", accuracy),
    )
    for caption, score in report:
        print(caption + str(score))
def main (): def main ():
tweets_train, tweets_train_labels_numeric, tweets_dev, tweets_dev_labels = read_corpus() tweets_train, tweets_train_labels_numeric, tweets_dev, tweets_dev_labels = read_corpus()
classification_embedings_rnn(tweets_train, tweets_train_labels_numeric) tweets_dev_classified_labels = classification_embedings_rnn(tweets_train, tweets_train_labels_numeric, tweets_dev)
calculate_quality_performamnce(tweets_dev_labels, tweets_dev_classified_labels, "RNN_LSTM")
if __name__ == '__main__': if __name__ == '__main__':
......
'''
Created on 20 jun. 2018
@author: fmplaza
'''
import pandas as pd
from nltk.tokenize import TweetTokenizer
import statistics as s
# Shared tokenizer built with TweetTokenizer's default arguments; used by
# calculate_position() to split each tweet into tokens.
tknzr = TweetTokenizer()
def read_corpus():
    '''Load the train and trial (dev) files of the corpus from ./corpus.

    All three files are read as tab-separated text whose first row is
    the header.

    Returns a 4-tuple:
        tweets_train         DataFrame read from train-v2.csv
        tweets_train_labels  the 'emotion' column of tweets_train
        tweets_dev           DataFrame read from trial-v2.csv
        tweets_dev_labels    DataFrame read from trial-v2.labels

    Raises KeyError if train-v2.csv has no 'emotion' column; file access
    errors from pandas propagate unchanged.
    '''
    tweets_train = pd.read_csv('./corpus/train-v2.csv', sep="\t", header=0)
    tweets_train_labels = tweets_train['emotion']
    tweets_dev = pd.read_csv('./corpus/trial-v2.csv', sep="\t", header=0)
    tweets_dev_labels = pd.read_csv('./corpus/trial-v2.labels', sep="\t", header=0)
    return tweets_train, tweets_train_labels, tweets_dev, tweets_dev_labels
def calculate_position(tweets_train):
    '''Print trigger-word position statistics and tweet-length statistics.

    Each entry of tweets_train['tweet'] is tokenized with the module-level
    tknzr. For tweets whose tokens include '#TRIGGERWORD', the index of its
    first occurrence is recorded; the token count is recorded for every
    tweet. The max, mean and mode of the positions and the mean and mode of
    the lengths are then printed.

    tweets_train: object whose 'tweet' entry iterates over tweet texts
        (main() passes the training DataFrame).

    Raises ValueError when no tweet contains '#TRIGGERWORD', because max()
    is then called on an empty list.
    '''
    position_triggerword = []
    len_tweet = []
    # Only the 'tweet' column is used, so iterate it directly rather than
    # building a full row object per tweet with iterrows().
    for tweet in tweets_train['tweet']:
        tokens = tknzr.tokenize(tweet)
        if '#TRIGGERWORD' in tokens:
            position_triggerword.append(tokens.index('#TRIGGERWORD'))
        len_tweet.append(len(tokens))
    print("Position trigger word")
    print("Max position: ", max(position_triggerword))
    print("Mean position" , s.mean(position_triggerword))
    print("Mode position", s.mode(position_triggerword))
    print("Lenght tweet")
    print("Mean lenght", s.mean(len_tweet))
    print("Mode lenght", s.mode(len_tweet))
def main():
    '''Load the corpus and print the statistics for its training split.'''
    # read_corpus() yields four values; only the training frame is used here.
    train_frame, _train_labels, _dev_frame, _dev_labels = read_corpus()
    calculate_position(train_frame)
# Run the analysis only when this file is executed as a script.
if __name__ == '__main__':
    main()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment