From b0b66d2fa0804e23f8a92c48912052cea74093c4 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Thu, 1 Jun 2017 13:50:44 +0530 Subject: [PATCH 1/6] added docstring for train method --- gensim/models/wrappers/wordrank.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index efeb020199..7cfbfca266 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -47,8 +47,12 @@ class Wordrank(KeyedVectors): @classmethod def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0, sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100, - beta=99, loss='hinge', memory=4.0, cleanup_files=True, sorted_vocab=1, ensemble=0): + beta=99, loss='hinge', memory=4.0, cleanup_files=False, sorted_vocab=1, ensemble=0): """ + The word and context embedding files are generated by wordrank binary and are saved in "out_name" directory + which is created inside wordrank directory. The vocab and cooccurence files are generated using glove code + available inside the wordrank directory. These files are used by the wordrank binary for training. + `wr_path` is the path to the Wordrank directory. `corpus_file` is the filename of the text file to be used for training the Wordrank model. 
Expects file to contain space-separated tokens in a single line From bffdee673e53fc310a47cfe517f9e7ebee778229 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Thu, 1 Jun 2017 16:28:12 +0530 Subject: [PATCH 2/6] fix flake8 error --- gensim/models/wrappers/wordrank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 7cfbfca266..82ac428c33 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -51,7 +51,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, """ The word and context embedding files are generated by wordrank binary and are saved in "out_name" directory which is created inside wordrank directory. The vocab and cooccurence files are generated using glove code - available inside the wordrank directory. These files are used by the wordrank binary for training. + available inside the wordrank directory. These files are used by the wordrank binary for training. `wr_path` is the path to the Wordrank directory. `corpus_file` is the filename of the text file to be used for training the Wordrank model. From 251b6e72b8fe02669cad7e566c6f1e2da54a928a Mon Sep 17 00:00:00 2001 From: parulsethi Date: Thu, 1 Jun 2017 21:29:01 +0530 Subject: [PATCH 3/6] fix flake8 error --- gensim/models/wrappers/wordrank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 82ac428c33..89d5a933b7 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -52,7 +52,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, The word and context embedding files are generated by wordrank binary and are saved in "out_name" directory which is created inside wordrank directory. The vocab and cooccurence files are generated using glove code available inside the wordrank directory. 
These files are used by the wordrank binary for training. - + `wr_path` is the path to the Wordrank directory. `corpus_file` is the filename of the text file to be used for training the Wordrank model. Expects file to contain space-separated tokens in a single line From fa424e5f254666cd45ad50da56aaa86bf42bc6f8 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Thu, 1 Jun 2017 21:30:43 +0530 Subject: [PATCH 4/6] fix flake8 error --- gensim/models/wrappers/wordrank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 89d5a933b7..82ac428c33 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -52,7 +52,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, The word and context embedding files are generated by wordrank binary and are saved in "out_name" directory which is created inside wordrank directory. The vocab and cooccurence files are generated using glove code available inside the wordrank directory. These files are used by the wordrank binary for training. - + `wr_path` is the path to the Wordrank directory. `corpus_file` is the filename of the text file to be used for training the Wordrank model. 
Expects file to contain space-separated tokens in a single line From e28e6bbf06f3bc85defc4f5dd75e57351991de54 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Sat, 3 Jun 2017 03:46:33 +0530 Subject: [PATCH 5/6] fix doc --- docs/notebooks/Wordrank_comparisons.ipynb | 2 +- gensim/models/wrappers/wordrank.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/notebooks/Wordrank_comparisons.ipynb b/docs/notebooks/Wordrank_comparisons.ipynb index 2aecbd3911..61ddf99756 100644 --- a/docs/notebooks/Wordrank_comparisons.ipynb +++ b/docs/notebooks/Wordrank_comparisons.ipynb @@ -173,7 +173,7 @@ " \n", " # Train using wordrank\n", " output_file = '{:s}_wr'.format(output_name)\n", - " output_dir = 'wordrank_model' # directory to save embeddings and metadata to\n", + " output_dir = 'model' # directory to save embeddings and metadata to\n", " if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):\n", " print('\\nTraining wordrank on {:s} corpus..'.format(corpus_file))\n", " %time wr_model = Wordrank.train(WR_HOME, corpus_file, output_dir, **wr_params); wr_model\n", diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 82ac428c33..b8013addd2 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -8,7 +8,7 @@ `Word2Vec` for that. Example: ->>> model = gensim.models.wrappers.Wordrank('/Users/dummy/wordrank', corpus_file='text8', out_name='wr_model') +>>> model = gensim.models.wrappers.Wordrank.train('/Users/dummy/wordrank', corpus_file='text8', out_name='wr_model') >>> print model[word] # prints vector for given words .. 
[1] https://bitbucket.org/shihaoji/wordrank/ From 0d3495cb45238c52c135687115122bec9f10f9c6 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Tue, 6 Jun 2017 01:16:09 +0530 Subject: [PATCH 6/6] added info for content generated by wordrank --- gensim/models/wrappers/wordrank.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index b8013addd2..356be3051c 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -57,6 +57,10 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, `corpus_file` is the filename of the text file to be used for training the Wordrank model. Expects file to contain space-separated tokens in a single line `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and training data. + It will contain the following contents: + Word Embeddings saved after every dump_period and stored in a file model_word_"current iter".txt + Context Embeddings saved after every dump_period and stored in a file model_context_"current iter".txt + A meta directory which contains: 'vocab.txt' - vocab words, 'wiki.toy' - word-word co-occurrence values, 'meta' - vocab and co-occurrence lengths `size` is the dimensionality of the feature vectors. `window` is the number of context words to the left (and to the right, if symmetric = 1). `symmetric` if 0, only use left context words, else use left and right both.