Commit 3351f3a (parent: c2d6a5d): showing 9 changed files with 781 additions and 0 deletions.
.gitignore
```diff
@@ -11,3 +11,5 @@
 **/*.so
 data/
 **/nohup.out
+**/*_wrapper.cpp
+**/simvecs*
```
Makefile (new file)
```make
build:
	cd wsim && python setup.py build_ext --inplace

clean:
	rm -rf wsim/build
	rm wsim/wsim_wrapper.cpp
	rm wsim/wsim.cpython*.so

test:
	python test.py

train:
	python train.py ../data/dict_hindi ../res/mapping_hindi.txt -ne 100 -nb 9999 -bs 32000 -lr 0.001

reset:
	rm -rf checkpoints/
	rm -rf model.tf/
```
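The `build` target compiles the Cython extension in place (see `wsim/setup.py` below), `clean` removes the generated wrapper and shared object, and `train` runs the training script with the hyperparameters used here: 100 epochs of 9999 batches of 32000 pairs each at a 0.001 learning rate. `reset` clears checkpoints and the saved model so training starts fresh.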
requirements.txt (new file)
```
six
cython
numpy
tensorflow==2.2.0
```
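`cython` is needed to build the `wsim` extension, and `tensorflow` is pinned to 2.2.0, the version whose `tf.data.Dataset.from_generator` and `tf.config.experimental` calls `train.py` below relies on.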
test.py (new file)
```python
from time import time
from wsim.wsim import wsimdict as wd

# build a similarity dictionary from the phoneme mapping and the word list
a = wd('../res/mapping_hindi.txt', '../data/dict_hindi')

start_time = time()
# the commented calls below show the rest of the wsimdict API
# r = a.top_similar('SIT', 10, wd.BIGRAM | wd.INSERT_BEG_END, 1)
r = a.top_similar('समान', 20, wd.BIGRAM | wd.INSERT_BEG_END | wd.VOWEL_BUFF, 0.5)
# r = a.similarity('WONDER', 'ASUNDER', wd.BIGRAM | wd.INSERT_BEG_END | wd.VOWEL_BUFF, 1)
# r = a.random_scores(5, wd.BIGRAM | wd.INSERT_BEG_END | wd.VOWEL_BUFF, 0.4)
# r = a.get_word(1)
# r = a.get_index('उन्नाव')
# r = [a.get_index(s.upper()) for s in words]
print(f'time taken: {time() - start_time}')
print(r)
```
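The scoring options (`BIGRAM`, `INSERT_BEG_END`, `VOWEL_BUFF`) are bit flags combined with `|`; judging by the `D_PEN` comment in `train.py` below, the final positional argument is a penalty factor applied as its reciprocal. `top_similar` and `similarity` query by word, while `random_scores`, `get_word`, and `get_index` expose the index-based interface that training uses.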
train.py (new file)
```python
import os
import argparse
from time import time

import numpy as np
from numpy.linalg import norm
import tensorflow as tf

from wsim.wsim import wsimdict as wd

# scoring flags shared by training and evaluation
D_FLAGS = wd.BIGRAM | wd.INSERT_BEG_END | wd.VOWEL_BUFF
D_PEN = 0.4  # penalty = 1/0.4 = 2.5


class Dictionary:
    def __init__(self, params):
        self.dictionary = wd(params.mapping_path, params.dictionary_path)
        self.num_of_batches = params.num_of_batches
        self.batch_size = params.batch_size

    def data_generator(self):
        # each batch is a pair of word-index arrays and their wsim scores
        for _ in range(self.num_of_batches):
            (i1, i2), s = self.dictionary.random_scores(
                self.batch_size, D_FLAGS, D_PEN)
            yield ((np.array(i1), np.array(i2)), np.array(s))

    def dataset(self):
        return tf.data.Dataset.from_generator(
            self.data_generator,
            ((tf.int64, tf.int64), tf.float32),
            ((tf.TensorShape([None]), tf.TensorShape([None])),
             tf.TensorShape([None])))


class EmbeddingLayer(tf.keras.layers.Layer):
    def __init__(self, dictionary_size, vector_size):
        super(EmbeddingLayer, self).__init__()
        self.dictionary_size = dictionary_size
        self.vector_size = vector_size

    def build(self, input_shape):
        w_init = tf.random_normal_initializer()
        self.embedding = tf.Variable(
            initial_value=w_init(
                shape=(self.dictionary_size, self.vector_size),
                dtype=tf.float32),
            trainable=True)

    def call(self, inputs):
        x1, x2 = inputs
        X1 = tf.nn.embedding_lookup(self.embedding, x1, max_norm=1)
        X2 = tf.nn.embedding_lookup(self.embedding, x2, max_norm=1)
        # dot product of the norm-clipped embeddings approximates the score
        return tf.reduce_sum(tf.multiply(X1, X2), axis=-1)


class EmbeddingModel(tf.keras.Model):
    def __init__(self, dictionary, params):
        super(EmbeddingModel, self).__init__()
        self.embedding = EmbeddingLayer(len(dictionary), params.vector_size)

    @tf.function
    def call(self, inputs):
        return self.embedding(inputs)


def _main(params):
    config(params)
    dictionary = Dictionary(params)
    dataset = dictionary.dataset()
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    model = (EmbeddingModel(dictionary.dictionary, params)
             if params.load_model == ''
             else tf.keras.models.load_model(params.load_model))

    model.compile(optimizer=tf.keras.optimizers.Adam(params.learning_rate),
                  loss='mean_squared_error',
                  metrics=['mse'])

    if os.path.isdir(params.checkpoint_path):
        model.load_weights(params.checkpoint_path)
    else:
        os.makedirs(params.checkpoint_path)

    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=params.checkpoint_path, save_weights_only=True, verbose=1)
    for i in range(params.num_of_epochs):
        start = time()
        model.fit(dataset, callbacks=[cp_callback])
        print(f'Epoch: {i}, time taken: {time() - start}')
    embedding = model.embedding.embedding.numpy()
    model.save(params.save_model)
    check(dictionary.dictionary, embedding, params.check_size)
    export(dictionary.dictionary, embedding, params.save_embedding)


def check(dictionary, embd, n):
    # compare cosine similarities of learned vectors against wsim scores
    (i1, i2), actual = dictionary.random_scores(n, D_FLAGS, D_PEN)
    v1, v2 = embd[i1], embd[i2]
    n1, n2 = norm(v1, axis=1), norm(v2, axis=1)
    predicted = np.sum(v1 * v2, axis=1) / (n1 * n2)
    print(f'{n} MSE', np.square(actual - predicted).mean())
    a = np.abs(actual - predicted)
    print(f'{n} Diff avg: {np.mean(a)}, minmax: {np.min(a)} - {np.max(a)}')
    print(f'{n} Embedding Norms min: {min(np.min(n1), np.min(n2))}'
          f' max: {max(np.max(n1), np.max(n2))}')


def export(dictionary, embedding, path):
    # one line per word: "<word> <v1> <v2> ... <vn>"
    with open(path, 'w') as fout:
        for i, v in enumerate(embedding):
            word = dictionary.get_word(i)
            vector = ' '.join(str(x) for x in v)
            fout.write(f'{word} {vector}\n')


def config(params):
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), 'Physical GPUs,', len(logical_gpus),
                  'Logical GPUs')
        except RuntimeError as e:
            # memory growth must be set before GPUs have been initialized
            print(e)


def _args():
    parser = argparse.ArgumentParser(
        prog=__file__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('dictionary_path', type=str)
    parser.add_argument('mapping_path', type=str)
    parser.add_argument('-bs', '--batch_size', type=int, default=4096,
                        metavar='', help='batch size')
    parser.add_argument('-vs', '--vector_size', type=int, default=50,
                        metavar='', help='embedding vector length')
    parser.add_argument('-nb', '--num_of_batches', type=int, default=100,
                        metavar='', help='number of batches per epoch')
    parser.add_argument('-ne', '--num_of_epochs', type=int, default=1,
                        metavar='', help='number of epochs')
    # accepted because the Makefile train target passes -lr; the default
    # matches the Adam default, so behavior is unchanged when omitted
    parser.add_argument('-lr', '--learning_rate', type=float, default=0.001,
                        metavar='', help='learning rate')
    parser.add_argument('-lm', '--load_model', type=str, default='',
                        metavar='', help='path to load model')
    parser.add_argument('-sm', '--save_model', type=str, default='model.tf',
                        metavar='', help='path to save model')
    parser.add_argument('-se', '--save_embedding', type=str,
                        default='simvecs_hindi', metavar='',
                        help='path to save embedding')
    parser.add_argument('-cp', '--checkpoint_path', type=str,
                        default='checkpoints/', metavar='',
                        help='checkpoint path')
    parser.add_argument('-cs', '--check_size', type=int, default=100000,
                        metavar='',
                        help='number of pairs to check after each epoch')
    return parser.parse_args()


if __name__ == '__main__':
    params = _args()
    _main(params)
```
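The exported embedding file has one word per line followed by its vector components, and the training objective drives the cosine similarity of two vectors toward the `wsim` score of the corresponding words. A minimal consumer sketch, assuming only that text format (the `load_embeddings` helper and the example word pair are illustrative, not part of this commit):

```python
import numpy as np

def load_embeddings(path):
    # parse the "word v1 v2 ... vn" lines written by export() above
    vectors = {}
    with open(path, encoding='utf-8') as fin:
        for line in fin:
            word, *values = line.rstrip('\n').split(' ')
            vectors[word] = np.array(values, dtype=np.float32)
    return vectors

def cosine(u, v):
    # the quantity the model was trained to match against wsim scores
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

vecs = load_embeddings('simvecs_hindi')
print(cosine(vecs['समान'], vecs['सामान']))
```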
wsim/setup.py (new file)
```python
import sys
from setuptools import setup
from distutils.core import Extension
from Cython.Build import cythonize

compile_args = ['-g', '-std=c++17']
link_args = []

if sys.platform == 'darwin':
    compile_args.append('-stdlib=libc++')
    compile_args.append('-mmacosx-version-min=10.9')
else:
    compile_args.append('-std=gnu++17')
    # compile_args.append('-fopenmp')
    # link_args.append('-fopenmp')

wsim = Extension('wsim',
                 sources=['wsim_wrapper.pyx', 'wsim.cpp'],
                 language='c++',
                 extra_compile_args=compile_args,
                 extra_link_args=link_args)

setup(name='wsim',
      version='0.2.2',
      description='Computes similarity between words based on phoneme features',
      author='Rahul Sharma',
      author_email='rahulsrma26@gmail.com',
      ext_modules=cythonize(wsim))
```
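`cythonize` regenerates `wsim_wrapper.cpp` from `wsim_wrapper.pyx` at build time, which is why this commit adds `**/*_wrapper.cpp` to `.gitignore`, and `build_ext --inplace` (the Makefile `build` target) drops the `wsim.cpython*.so` artifact that the `clean` target removes.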