# EmojiTraining.py

# -*- coding: utf-8 -*-
"""Emoji Prediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1-nQ4fFiQsri48Cu7j1xAM8TCGr4CdS6j
"""

import numpy as np
import pandas as pd
import emoji
import itertools

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, SimpleRNN,LSTM, Activation
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

# Load the training and test CSVs. Neither file is read with a header row
# (header=None), so columns are addressed by integer position below.
train, test = (
  pd.read_csv(csv_path, header=None)
  for csv_path in ('/content/train_emoji.csv', '/content/test_emoji.csv')
)

#train.head()

#test.head()

# Maps an integer class label (0-4) to the emoji alias string that is passed
# to emoji.emojize() when a prediction is displayed.
emoji_dict = dict(enumerate((
  ":heart:",
  ":baseball:",
  ":smile:",
  ":disappointed:",
  ":fork_and_knife:",
)))

#for i in emoji_dict.keys():
  #print (i,end=" ")
  #print (emoji.emojize(emoji_dict[i], use_aliases=True))

#from sklearn.model_selection import train_test_split
#seed = 42

#X_train, X_test, Y_train, Y_test = train_test_split(train[0],train[1],test_size=0.33,random_state=seed)
#print(X_train.shape,Y_train.shape,X_test.shape,Y_test.shape)

# Creating training and testing data
# Column 0 holds the sentence text and column 1 the integer class label.
#
# Each sentence is tokenised on whitespace with Series.str.split(), which
# returns a NEW Series of token lists. This avoids writing lists element by
# element into a Series that is a column of the DataFrame (a chained
# assignment that mutates `train`/`test` as a side effect and may fail if the
# column uses pandas' dedicated string dtype).
X_train = train[0].str.split()
X_test = test[0].str.split()

# Test labels stay as integer class ids so they can be compared directly with
# the argmax of the model's predictions.
Y_test = test[1]

#print (X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

# Training labels are one-hot encoded because the model is trained with a
# categorical cross-entropy loss.
Y_train = to_categorical(train[1])

# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
# Each non-empty line is expected to be "<word> <v1> <v2> ..."; the vector
# length is presumably 50 (per the file name) -- the code below relies on that.
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open('/content/glove.6B.50d.txt', encoding="utf-8") as glove_file:
  for line in glove_file:
    values = line.split()
    if not values:
      # Tolerate blank lines (e.g. a trailing newline) instead of crashing.
      continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

# Stopwords: one word per line, surrounding whitespace stripped. Tokens found
# in this list are skipped when the embedding matrices are filled.
# `with` guarantees the file is closed even if reading fails part-way.
with open('/content/stopwords.txt', encoding="utf-8") as stopwords_file:
  stopwords = [line.strip() for line in stopwords_file]

#print(stopwords)

#from scipy import spatial
# Checking cosine similarity of words happy and sad
#spatial.distance.cosine(embeddings_index["happy"], embeddings_index["sad"])

# Filling the embedding matrix
def _embed_sentences(sentences, embeddings, skip_words, max_len=10, dim=50):
  """Turn tokenised sentences into a (n_sentences, max_len, dim) float array.

  Args:
    sentences: sized iterable of token lists (one list per sentence).
    embeddings: mapping from lower-cased word to its vector of length `dim`.
    skip_words: container of lower-cased words to leave as zero vectors.
    max_len: number of token positions kept per sentence; longer sentences
      are truncated so they fit the fixed-size model input.
    dim: length of each word vector.

  Returns:
    A zero-initialised array in which position [s, t] holds the vector of
    token t of sentence s. Stopwords, words missing from `embeddings`, and
    padding positions stay all-zero.
  """
  matrix = np.zeros((len(sentences), max_len, dim))
  for row, tokens in enumerate(sentences):
    # Truncate instead of raising IndexError on sentences longer than max_len.
    for col, token in enumerate(tokens[:max_len]):
      word = token.lower()
      if word in skip_words:
        continue
      # .get() avoids a KeyError on out-of-vocabulary words.
      vector = embeddings.get(word)
      if vector is not None:
        matrix[row, col] = vector
  return matrix


# A set gives O(1) membership tests; the module-level list is left untouched.
_stopword_set = set(stopwords)

# Shape (n_sentences, 10, 50) matches the input_shape the LSTM model expects.
embedding_matrix_train = _embed_sentences(X_train, embeddings_index, _stopword_set)
embedding_matrix_test = _embed_sentences(X_test, embeddings_index, _stopword_set)

#print (embedding_matrix_train.shape, embedding_matrix_test.shape)

# Two stacked LSTM layers with dropout after each, followed by a 5-way softmax
# classifier. Input is a (10, 50) sequence: 10 token positions of 50-d vectors.
model = Sequential([
  # First LSTM returns the full sequence so the second LSTM can consume it.
  LSTM(128, input_shape=(10, 50), return_sequences=True),
  Dropout(0.5),
  # Second LSTM returns only its final output.
  LSTM(128, return_sequences=False),
  Dropout(0.5),
  # One output unit per entry in emoji_dict.
  Dense(5),
  Activation('softmax'),
])

#model.summary()

# Configure training: categorical cross-entropy over the one-hot labels,
# Adam optimiser, and accuracy reported per epoch.
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

# Train for 50 epochs in shuffled mini-batches of 40; `hist` keeps the
# per-epoch history returned by fit().
hist = model.fit(
  x=embedding_matrix_train,
  y=Y_train,
  batch_size=40,
  epochs=50,
  shuffle=True,
)

# Predict on the test set: take the softmax outputs for every test sentence
# and pick the index of the largest one along the last axis as the class.
class_scores = model.predict(embedding_matrix_test)
pred = np.argmax(class_scores, axis=-1)
# pred = (model.predict(embedding_matrix_test) > 0.5).astype("int32")
# print(emoji.emojize(emoji_dict[pred[0]]))

"""### ACCURACY"""

# Calculating accuracy / score of the model: the fraction of test sentences
# whose predicted class equals the labelled class. The value is stored and
# printed, because a bare expression is silently discarded when this file is
# run as a script rather than as a notebook cell.
accuracy = float(np.mean(pred == np.asarray(Y_test)))
print("Test accuracy:", accuracy)

# Printing the sentences with the predicted and the labelled emoji
#for i in range(embedding_matrix_test.shape[0]):
  #if pred[i] != Y_test[i]:
    #print(i)
    #print (test[0][i],end=" ")
    #print (emoji.emojize(emoji_dict[pred[i]], use_aliases=True),end=" ")
    #print (emoji.emojize(emoji_dict[Y_test[i]], use_aliases=True))

#epochs = 50
#plt.style.use("ggplot")
#plt.figure()
#plt.plot(np.arange(0, epochs), hist.history["loss"], label = "Train Loss")
#plt.plot(np.arange(0, epochs), hist.history["accuracy"], label = "Train Acc")

#plt.title("Loss and Accuracy plot")
#plt.xlabel("Epoch")
#plt.ylabel("Loss / Accuracy")
#plt.legend(loc = "lower left")
#plt.savefig("plot.jpg")