🔡 Added hindi embedding code
rahulsrma26 committed Sep 26, 2020
1 parent c2d6a5d commit 3351f3a
Showing 9 changed files with 781 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -11,3 +11,5 @@
**/*.so
data/
**/nohup.out
**/*_wrapper.cpp
**/simvecs*
17 changes: 17 additions & 0 deletions embedding_hindi/Makefile
@@ -0,0 +1,17 @@
.PHONY: build clean test train reset

build:
	cd wsim && python setup.py build_ext --inplace

clean:
	rm -rf wsim/build
	rm -f wsim/wsim_wrapper.cpp
	rm -f wsim/wsim.cpython*.so

test:
	python test.py

# 100 epochs of 9999 batches, batch size 32000, learning rate 0.001
train:
	python train.py ../data/dict_hindi ../res/mapping_hindi.txt -ne 100 -nb 9999 -bs 32000 -lr 0.001

reset:
	rm -rf checkpoints/
	rm -rf model.tf/
4 changes: 4 additions & 0 deletions embedding_hindi/requirements.txt
@@ -0,0 +1,4 @@
six
cython
numpy
tensorflow==2.2.0
15 changes: 15 additions & 0 deletions embedding_hindi/test.py
@@ -0,0 +1,15 @@
from time import time
from wsim.wsim import wsimdict as wd

a = wd('../res/mapping_hindi.txt', '../data/dict_hindi')

start_time = time()
# r = a.top_similar('SIT', 10, wd.BIGRAM | wd.INSERT_BEG_END, 1)
r = a.top_similar('समान', 20, wd.BIGRAM | wd.INSERT_BEG_END | wd.VOWEL_BUFF, 0.5)
# r = a.similarity('WONDER', 'ASUNDER', wd.BIGRAM | wd.INSERT_BEG_END | wd.VOWEL_BUFF, 1)
# r = a.random_scores(5, wd.BIGRAM | wd.INSERT_BEG_END | wd.VOWEL_BUFF, 0.4)
# r = a.get_word(1)
# r = a.get_index('उन्नाव')
# r = [a.get_index(s.upper()) for s in words]
print(f'time taken: {time() - start_time}')
print(r)
204 changes: 204 additions & 0 deletions embedding_hindi/train.py
@@ -0,0 +1,204 @@
import os
import argparse
from wsim.wsim import wsimdict as wd
from time import time
import numpy as np
from numpy.linalg import norm
import tensorflow as tf
# from tensorflow.keras.constraints import UnitNorm
from random import randrange, random

D_FLAGS = wd.BIGRAM | wd.INSERT_BEG_END | wd.VOWEL_BUFF
D_PEN = 0.4 # penalty = 1/0.4 = 2.5


class Dictionary:
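    """Streams ((i1, i2), score) training batches from the wsim dictionary."""
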
def data_generator(self):
# n = int(len(self.dictionary))
for _ in range(self.num_of_batches):
            (i1, i2), s = self.dictionary.random_scores(
                self.batch_size, D_FLAGS, D_PEN)
yield ((np.array(i1), np.array(i2)), np.array(s))

def __init__(self, params):
self.dictionary = wd(params.mapping_path, params.dictionary_path)
self.num_of_batches = params.num_of_batches
self.batch_size = params.batch_size

    def dataset(self):
        # Output signature uses the positional (output_types, output_shapes)
        # form of from_generator, matching the pinned tensorflow==2.2.0.
        return tf.data.Dataset.from_generator(
            self.data_generator,
            ((tf.int64, tf.int64), tf.float32),
            ((tf.TensorShape([None]), tf.TensorShape([None])),
             tf.TensorShape([None])))


class EmbeddingLayer(tf.keras.layers.Layer):
def __init__(self, dictionary_size, vector_size):
super(EmbeddingLayer, self).__init__()
self.dictionary_size = dictionary_size
self.vector_size = vector_size

def build(self, input_shape):
w_init = tf.random_normal_initializer()
self.embedding = tf.Variable(
initial_value=w_init(
shape=(self.dictionary_size, self.vector_size),
dtype=tf.float32),
trainable=True)

def call(self, input):
x1, x2 = input
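        # max_norm=1 clips each looked-up vector to at most unit L2 norm,
        # so the dot product below stays a bounded similarity score.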
X1 = tf.nn.embedding_lookup(self.embedding, x1, max_norm=1)
X2 = tf.nn.embedding_lookup(self.embedding, x2, max_norm=1)
return tf.reduce_sum(tf.multiply(X1, X2), axis=-1)


class EmbeddingModel(tf.keras.Model):
def __init__(self, dictionary, params):
super(EmbeddingModel, self).__init__()
self.embedding = EmbeddingLayer(len(dictionary), params.vector_size)

@tf.function
def call(self, inputs):
return self.embedding(inputs)


def _main(params):
config(params)
dictionary = Dictionary(params)
dataset = dictionary.dataset()
# def parse_fn(_):
# return EmbeddingDataset(dictionary, params.num_of_batches // 4, params.batch_size)
# dataset = tf.data.Dataset.range(4).interleave(
# parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
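    # Prefetch so batch generation overlaps with training.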
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    if params.load_model == '':
        model = EmbeddingModel(dictionary.dictionary, params)
    else:
        model = tf.keras.models.load_model(params.load_model)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=params.learning_rate),
        loss='mean_squared_error',
        metrics=['mse'])

if os.path.isdir(params.checkpoint_path):
model.load_weights(params.checkpoint_path)
else:
os.makedirs(params.checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
filepath=params.checkpoint_path, save_weights_only=True, verbose=1)
# model.summary()
for i in range(params.num_of_epochs):
start = time()
model.fit(dataset, callbacks=[cp_callback])
print(f'Epoch: {i}, time taken: {time() - start}')
# model.embedding = tf.nn.l2_normalize(model.embedding, axis=-1)
embedding = model.embedding.embedding.numpy()
# model.embedding = norm(embedding, axis=0)
model.save(params.save_model)
check(dictionary.dictionary, embedding, params.check_size)
export(dictionary.dictionary, embedding, params.save_embedding)


def check(dictionary, embd, n):
    # Compare cosine similarities from the learned embedding against the
    # string-similarity scores that wsim produces for n random pairs.
    (i1, i2), actual = dictionary.random_scores(n, D_FLAGS, D_PEN)
    v1, v2 = embd[i1], embd[i2]
    n1, n2 = norm(v1, axis=1), norm(v2, axis=1)
    predicted = np.sum(v1 * v2, axis=1) / (n1 * n2)
    print(f'{n} MSE', np.square(actual - predicted).mean())
    a = np.abs(actual - predicted)
    print(f'{n} Diff avg: {np.mean(a)}, minmax: {np.min(a)} - {np.max(a)}')
    print(f'{n} Embedding Norms min: {min(np.min(n1), np.min(n2))} max: {max(np.max(n1), np.max(n2))}')


def export(dictionary, embedding, path):
with open(path, 'w') as fout:
for i, v in enumerate(embedding):
word = dictionary.get_word(i)
vector = ' '.join([str(x) for x in v])
fout.write(f'{word} {vector}\n')


def config(params):
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
try:
# Currently, memory growth needs to be the same across GPUs
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
logical_gpus = tf.config.experimental.list_logical_devices(
'GPU')
print(len(gpus), "Physical GPUs,", len(logical_gpus),
"Logical GPUs")
except RuntimeError as e:
# Memory growth must be set before GPUs have been initialized
print(e)


def _args():
parser = argparse.ArgumentParser(
prog=__file__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('dictionary_path', type=str)
parser.add_argument('mapping_path', type=str)
parser.add_argument('-bs',
'--batch_size',
type=int,
default=4096,
metavar='',
help='batch size')
parser.add_argument('-vs',
'--vector_size',
type=int,
default=50,
metavar='',
help='embedding vector length')
    parser.add_argument('-nb',
                        '--num_of_batches',
                        type=int,
                        default=100,
                        metavar='',
                        help='number of batches per epoch')
    parser.add_argument('-ne',
                        '--num_of_epochs',
                        type=int,
                        default=1,
                        metavar='',
                        help='number of epochs')
    parser.add_argument('-lr',
                        '--learning_rate',
                        type=float,
                        default=0.001,
                        metavar='',
                        help='learning rate')
parser.add_argument('-lm',
'--load_model',
type=str,
default='',
metavar='',
                        help='path to load model from')
parser.add_argument('-sm',
'--save_model',
type=str,
default='model.tf',
metavar='',
                        help='path to save model to')
parser.add_argument('-se',
'--save_embedding',
type=str,
default='simvecs_hindi',
metavar='',
                        help='path to save embeddings to')
parser.add_argument('-cp',
'--checkpoint_path',
type=str,
default='checkpoints/',
metavar='',
help='checkpoint path')
parser.add_argument('-cs',
'--check_size',
type=int,
default=100000,
metavar='',
                        help='number of pairs to check after training')
return parser.parse_args()


if __name__ == '__main__':
params = _args()
_main(params)
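
The exported simvecs file (written by export() above) stores one word per line followed by its space-separated vector components. A minimal sketch for loading it back and scoring a pair with the same normalized dot product check() uses; the sample word pair is illustrative, not from the commit:

import numpy as np

def load_simvecs(path):
    # Each line is "<word> <v1> <v2> ... <vn>", as written by export().
    vectors = {}
    with open(path) as fin:
        for line in fin:
            word, *values = line.rstrip('\n').split(' ')
            vectors[word] = np.array([float(x) for x in values])
    return vectors

def cosine(a, b):
    # Normalized dot product, matching the "predicted" score in check().
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

vecs = load_simvecs('simvecs_hindi')  # default -se output path
print(cosine(vecs['समान'], vecs['सामान']))  # assumes both words are in the dictionary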
32 changes: 32 additions & 0 deletions embedding_hindi/wsim/setup.py
@@ -0,0 +1,32 @@
import sys
# from distutils.core import setup, Extension
from setuptools import setup
from distutils.core import Extension
# from Cython.Distutils import build_ext
from Cython.Build import cythonize

compile_args = ['-g']
link_args = []

if sys.platform == 'darwin':
    compile_args.append('-std=c++17')
    compile_args.append('-stdlib=libc++')
    compile_args.append('-mmacosx-version-min=10.9')
else:
    compile_args.append('-std=gnu++17')
# compile_args.append('-fopenmp')
# link_args.append('-fopenmp')

wsim = Extension('wsim',
sources=['wsim_wrapper.pyx', 'wsim.cpp'],
language="c++",
extra_compile_args=compile_args,
extra_link_args=link_args)

setup(
name='wsim',
version='0.2.2',
    description='Computes similarity between words based on phoneme features',
author='Rahul Sharma',
author_email='rahulsrma26@gmail.com',
# ext_modules=[wsim])
ext_modules=cythonize(wsim))
