# nlp_svc.py

# -*- coding: utf-8 -*-
"""NLP_SVC.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1TlBsTgEAnR2AjOKupMu8yVebmVOWbrEP
"""

# Importing Required Libraries

import nltk
import linecache
from itertools import islice
import re
import pandas as pd
import spacy
from spacy import displacy
from collections import Counter
# Load the small English spaCy pipeline once and expose it under both names
# used in this file. Loading the same model twice only costs time and memory.
nlp = spacy.load("en_core_web_sm")
sp = nlp

# Download the NLTK tokenizer ('punkt') and POS-tagger resources.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Some Required Data Structures

# Module-level containers, declared up front. `line_dict` and `relation_dict`
# are filled while parsing the training file; `lines` and `train_dataframe`
# are not referenced again in the visible script.
lines = list()
line_dict = dict()
relation_dict = dict()

# Five independent, empty placeholder frames (each name gets its own object).
(
    line_dataframe,
    line_test_dataframe,
    relation_test_dataframe,
    relation_dataframe,
    train_dataframe,
) = (pd.DataFrame() for _ in range(5))

# Creation of the Train Data

# Parse the training file in a single pass. Records are read in groups of
# four lines: the first holds `<id>\t"<sentence>"`, the second holds the
# relation label, and the remaining two lines are ignored.
with open('/content/TRAIN_FILE.TXT') as f:
  for position, line in enumerate(f):
    record, offset = divmod(position, 4)
    if offset == 0:
      lister = line.split('"')
      line_number = int(lister[0].split('\t')[0])
      # Take everything between the FIRST and LAST double quote so that a
      # sentence containing quotes of its own is not truncated at the
      # first inner quote. With no inner quotes this is exactly lister[1].
      if len(lister) > 2:
        line_dict[line_number] = '"'.join(lister[1:-1])
      else:
        line_dict[line_number] = lister[1]
    elif offset == 1:
      # Relation label, stripped of its trailing newline; keyed by the
      # 0-based record position.
      relation_dict[record] = line.split('\n')[0]

# One row per sentence, positionally indexed (0..N-1) so it lines up with
# the relation frame, which is keyed by record position.
line_dataframe = pd.DataFrame.from_dict(line_dict, orient='index', columns=['line'])
line_dataframe = line_dataframe.reset_index(drop=True)

relation_dataframe = pd.DataFrame.from_dict(relation_dict, orient='index', columns=['relation'])

# Assign the Series (not the whole one-column frame) to the new column.
line_dataframe['relation'] = relation_dataframe['relation']

# Notebook display of the resulting frame (no effect when run as a script).
line_dataframe

# Creation of the Test Data

# Parse the test file in a single pass. Records are read in groups of four
# lines: the first holds `<id>\t"<sentence>"`, the second holds the relation
# label, and the remaining two lines are ignored.
line_test_dict = {}
relation_test_dict = {}
with open('/content/TEST_FILE_FULL.TXT') as f:
  for position, line in enumerate(f):
    record, offset = divmod(position, 4)
    if offset == 0:
      lister = line.split('"')
      line_number = int(lister[0].split('\t')[0])
      # Take everything between the FIRST and LAST double quote so that a
      # sentence containing quotes of its own is not truncated at the
      # first inner quote. With no inner quotes this is exactly lister[1].
      if len(lister) > 2:
        line_test_dict[line_number] = '"'.join(lister[1:-1])
      else:
        line_test_dict[line_number] = lister[1]
    elif offset == 1:
      # Relation label, stripped of its trailing newline; keyed by the
      # 0-based record position.
      relation_test_dict[record] = line.split('\n')[0]

# One row per sentence, positionally indexed (0..N-1) so it lines up with
# the relation frame, which is keyed by record position.
line_test_dataframe = pd.DataFrame.from_dict(line_test_dict, orient='index', columns=['line'])
line_test_dataframe = line_test_dataframe.reset_index(drop=True)

relation_test_dataframe = pd.DataFrame.from_dict(relation_test_dict, orient='index', columns=['relation'])

# Assign the Series (not the whole one-column frame) to the new column.
line_test_dataframe['relation'] = relation_test_dataframe['relation']

"""# Train Dataframe is in line_dataframe

# Test Dataframe is in line_test_dataframe

# Tokenization
"""

# Adding a column of tokens to the dataframe

# Adding a column of tokens to the dataframe.
#
# For every training sentence:
#   1. tokenize it with NLTK;
#   2. replace each '<' token with the concatenation of itself and the two
#      tokens that follow it (so '<', 'e1', '>' becomes '<e1>');
#   3. drop the now-redundant tag fragments.
# `tokenize_dict` keeps the tokens as one ", "-joined string per sentence;
# `tok_dict` keeps them as a list.
tag_fragments = ('e1', 'e2', '/e1', '/e2', '>')

tokenize_dict = {}
tok_dict = {}
for key, val in line_dict.items():
  raw_tokens = nltk.word_tokenize(val)
  merged = [
      ''.join(raw_tokens[i:i+3]) if tok == '<' else tok
      for i, tok in enumerate(raw_tokens)
  ]
  # Filter once per sentence. Previously this ran inside the per-token loop,
  # which was quadratic and never ran at all for an empty token list.
  l = [e for e in merged if e not in tag_fragments]
  tokenize_dict[key] = ', '.join(str(s) for s in l)
  tok_dict[key] = l

tokenize_dataframe = pd.DataFrame.from_dict(tokenize_dict, orient='index', columns=['token'])
tokenize_dataframe = tokenize_dataframe.reset_index()

# Both frames carry a positional 0..N-1 index at this point, so the column
# assignment aligns row by row.
line_dataframe['tokens'] = tokenize_dataframe['token']

"""# POS And Dep Parse"""

# Run the spaCy pipeline over every training sentence and record the
# coarse POS tag and dependency label of each token as ", "-joined strings.
pos_dict = {}
dep_dict = {}
for key, val in tokenize_dict.items():
  # Strip every comma from the stored token string before parsing.
  # (`''.join` is kept as in the original; on a string it is a no-op.)
  text = ''.join(val).replace(',', '')
  parsed = sp(text)
  pos_dict[key] = ', '.join(str(word.pos_) for word in parsed)
  dep_dict[key] = ', '.join(str(word.dep_) for word in parsed)

# Turn both mappings into frames with a positional index plus an 'index'
# column that holds the original sentence ids.
pos_dataframe = pd.DataFrame.from_dict(
    pos_dict, orient='index', columns=['pos']
).reset_index()
dep_dataframe = pd.DataFrame.from_dict(
    dep_dict, orient='index', columns=['dep']
).reset_index()

# Attach the new feature columns row by row.
for column, frame in (('pos', pos_dataframe), ('dep', dep_dataframe)):
  line_dataframe[column] = frame[column]

"""# POS (Not Needed)"""

# pos_dict = tok_dict.copy()

# for key, val in pos_dict.items():
#   l = nltk.pos_tag(val)
#   pos_dict[key] = ', '.join(str(ele) for ele in l)

# pos_dataframe = pd.DataFrame.from_dict(pos_dict, orient='index', columns=['pos'])
# line_dataframe['tokens'] = pos_dataframe['pos']

"""# NER"""

# Extract the text wrapped in <e1>...</e1> and <e2>...</e2> from every
# training sentence and store it in the 'e1' / 'e2' columns.
line_dataframe['entities'] = line_dataframe['line']
entity_dict = {}
for i, val in enumerate(line_dataframe['entities']):
  e1 = re.findall(r'<e1>(.*?)</e1>', val)
  e2 = re.findall(r'<e2>(.*?)</e2>', val)
  # NOTE(review): str() of the findall result stores the list's repr
  # (e.g. "['word']"), not the bare entity text. Kept as-is so the train and
  # test columns stay in the same format; confirm this is intended.
  entity_dict[i+1] = (str(e1), str(e2))

entity_dataframe = pd.DataFrame.from_dict(entity_dict, orient='index', columns=['e1', 'e2'])
entity_dataframe = entity_dataframe.reset_index()

line_dataframe['e1'] = entity_dataframe['e1']
line_dataframe['e2'] = entity_dataframe['e2']

# DataFrame.drop returns a new frame; the result must be assigned back or the
# helper 'entities' column is never actually removed.
line_dataframe = line_dataframe.drop(columns=['entities'])

# Notebook display of the resulting frame (no effect when run as a script).
line_dataframe

"""# Before We Train the model, We'll now do these transformations on the Test Data Too

## Test Data Transformations
"""

# NER

# Working copy of the sentences; removed again once the entity columns exist.
line_test_dataframe['entities'] = line_test_dataframe['line']

# Map each row position to the (e1, e2) pair, each stored as the string form
# of the list of regex matches found in that sentence.
entity_test_dict = {
    position: (
        str(re.findall('<e1>(.*?)</e1>', sentence)),
        str(re.findall('<e2>(.*?)</e2>', sentence)),
    )
    for position, sentence in enumerate(line_test_dataframe['entities'])
}

entity_test_dataframe = pd.DataFrame.from_dict(
    entity_test_dict, orient='index', columns=['e1', 'e2']
).reset_index()

for column in ('e1', 'e2'):
  line_test_dataframe[column] = entity_test_dataframe[column]

line_test_dataframe = line_test_dataframe.drop(columns=['entities'])

# Notebook display of the resulting frame (no effect when run as a script).
line_test_dataframe

# Tokenize and POS and DEP for TEST


# For every test sentence:
#   1. tokenize it with NLTK;
#   2. replace each '<' token with the concatenation of itself and the two
#      tokens that follow it (so '<', 'e1', '>' becomes '<e1>');
#   3. drop the now-redundant tag fragments.
# The surviving tokens are stored as one ", "-joined string per sentence.
tag_fragments = ('e1', 'e2', '/e1', '/e2', '>')

tokenize_test_dict = {}
for key, val in line_test_dict.items():
  raw_tokens = nltk.word_tokenize(val)
  merged = [
      ''.join(raw_tokens[i:i+3]) if tok == '<' else tok
      for i, tok in enumerate(raw_tokens)
  ]
  # Filter once per sentence. Previously this ran inside the per-token loop,
  # which was quadratic and never ran at all for an empty token list.
  l = [e for e in merged if e not in tag_fragments]
  tokenize_test_dict[key] = ', '.join(str(s) for s in l)


tokenize_test_dataframe = pd.DataFrame.from_dict(tokenize_test_dict, orient='index', columns=['token'])
tokenize_test_dataframe = tokenize_test_dataframe.reset_index()

# Both frames carry a positional 0..N-1 index at this point, so the column
# assignment aligns row by row.
line_test_dataframe['tokens'] = tokenize_test_dataframe['token']

# POS AND DEP FOR TEST

# Run the spaCy pipeline over every test sentence and record the coarse POS
# tag and dependency label of each token as ", "-joined strings.
pos_test_dict = {}
dep_test_dict = {}
for key, val in tokenize_test_dict.items():
  # Strip every comma from the stored token string before parsing.
  # (`''.join` is kept as in the original; on a string it is a no-op.)
  text = ''.join(val).replace(',', '')
  parsed = sp(text)
  pos_test_dict[key] = ', '.join(str(word.pos_) for word in parsed)
  dep_test_dict[key] = ', '.join(str(word.dep_) for word in parsed)

# Turn both mappings into frames with a positional index plus an 'index'
# column that holds the original sentence ids.
pos_test_dataframe = pd.DataFrame.from_dict(
    pos_test_dict, orient='index', columns=['pos']
).reset_index()
dep_test_dataframe = pd.DataFrame.from_dict(
    dep_test_dict, orient='index', columns=['dep']
).reset_index()

# Attach the new feature columns row by row.
for column, frame in (('pos', pos_test_dataframe), ('dep', dep_test_dataframe)):
  line_test_dataframe[column] = frame[column]

"""# Build the Model"""

# Requires the third-party `sklearn-pandas` package. The original notebook cell
# ran the IPython shell escape below, which is a SyntaxError in a plain .py
# file; run it in a notebook cell (or drop the `!` and run it from a shell):
#   !pip install sklearn-pandas

# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(line_dataframe, line_dataframe['relation'], test_size=0.25)

# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report

# clf = LogisticRegression(C=1e5)
# clf.fit(x_train, y_train)
# y_pred = clf.predict(x_test)
# print(classification_report(y_test, y_pred))

# Importing Libraries for SVC

from sklearn_pandas import DataFrameMapper, gen_features
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Enter the column names of the features needed, Can be changed.
# Each listed column is vectorized with its own HashingVectorizer.
cat_features = ["pos", "dep"]
gf = gen_features(cat_features, [HashingVectorizer])
mapper = DataFrameMapper(gf)

# Train on the first 200 rows only (kept small for Colab limits); raise the
# upper bound for better training. `.copy()` makes the slice an independent
# frame, so adding or overwriting columns on it later does not trigger
# pandas' SettingWithCopyWarning.
line_dataframe = line_dataframe.iloc[0:200, :].copy()

cat_features_transformed = mapper.fit_transform(line_dataframe)

# Keep the fitted encoder so integer predictions can be mapped back to
# relation names with `label_encoder.inverse_transform(...)`.
label_encoder = LabelEncoder()
target_name_encoded = label_encoder.fit_transform(line_dataframe['relation'])

# Linear-kernel SVM with probability estimates enabled.
# (The estimator repr that the notebook had pasted after `fit` was removed:
# as code it only constructed and discarded a second, unfitted SVC.)
model = SVC(kernel = "linear", probability=True)
model.fit(cat_features_transformed, target_name_encoded)

### For test/prediction part ###

# test_features_transformed = mapper.transform(df_test)
# predictions = model.predict(test_features_transformed)

# Notebook display of the fitted estimator (no effect when run as a script).
model

# Score the first 200 test rows; the gold 'relation' column is removed
# before the features are built.
line_test_dataframe = line_test_dataframe.iloc[:200].drop(columns=['relation'])

# Reuse the mapper fitted on the training frame, then predict encoded labels.
test_features_transformed = mapper.transform(line_test_dataframe)
predictions = model.predict(test_features_transformed)

# predictions

# Store the encoded predictions next to the rows they belong to.
line_test_dataframe['prediction'] = predictions

# Notebook preview of the first 20 rows (no effect when run as a script).
line_test_dataframe.head(20)

# Replace the relation names in the training frame with their integer
# category codes.
line_dataframe['relation'] = line_dataframe['relation'].astype('category').cat.codes