Merge pull request #1378 from parulsethi/wordrank_docs
Add docstrings for Wordrank
menshikh-iv authored Jun 6, 2017
2 parents eefca37 + 0d3495c commit 0e6f1b2
Showing 2 changed files with 11 additions and 3 deletions.
2 changes: 1 addition & 1 deletion docs/notebooks/Wordrank_comparisons.ipynb
@@ -173,7 +173,7 @@
" \n",
" # Train using wordrank\n",
" output_file = '{:s}_wr'.format(output_name)\n",
" output_dir = 'wordrank_model' # directory to save embeddings and metadata to\n",
" output_dir = 'model' # directory to save embeddings and metadata to\n",
" if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):\n",
" print('\\nTraining wordrank on {:s} corpus..'.format(corpus_file))\n",
" %time wr_model = Wordrank.train(WR_HOME, corpus_file, output_dir, **wr_params); wr_model\n",
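For context, here is roughly what that notebook cell does, rewritten as plain Python without the %time magic. This is a sketch, not the notebook's exact code: MODELS_DIR, WR_HOME, and the wr_params values shown are placeholders standing in for the notebook's own variables.

    import os
    from gensim.models.wrappers import Wordrank

    MODELS_DIR = 'models/'          # placeholder for the notebook's models directory
    WR_HOME = '/path/to/wordrank'   # placeholder for the local Wordrank checkout
    corpus_file = 'text8'
    output_name = 'text8'
    wr_params = {'size': 100, 'window': 15, 'iter': 90}  # illustrative subset

    output_file = '{:s}_wr'.format(output_name)
    output_dir = 'model'  # directory to save embeddings and metadata to

    # Skip training if a saved .vec file for this run already exists.
    if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):
        print('\nTraining wordrank on {:s} corpus..'.format(corpus_file))
        wr_model = Wordrank.train(WR_HOME, corpus_file, output_dir, **wr_params)

The change in this hunk only renames the output directory from 'wordrank_model' to 'model'; the training flow itself is unchanged.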
12 changes: 10 additions & 2 deletions gensim/models/wrappers/wordrank.py
@@ -8,7 +8,7 @@
`Word2Vec` for that.
Example:
->>> model = gensim.models.wrappers.Wordrank('/Users/dummy/wordrank', corpus_file='text8', out_name='wr_model')
+>>> model = gensim.models.wrappers.Wordrank.train('/Users/dummy/wordrank', corpus_file='text8', out_name='wr_model')
 >>> print model[word]  # prints vector for the given word
.. [1] https://bitbucket.org/shihaoji/wordrank/
@@ -47,12 +47,20 @@ class Wordrank(KeyedVectors):
@classmethod
def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
-              beta=99, loss='hinge', memory=4.0, cleanup_files=True, sorted_vocab=1, ensemble=0):
+              beta=99, loss='hinge', memory=4.0, cleanup_files=False, sorted_vocab=1, ensemble=0):
"""
The word and context embedding files are generated by the wordrank binary and saved in the "out_name" directory,
which is created inside the wordrank directory. The vocab and co-occurrence files are generated using the glove code
shipped inside the wordrank directory. These files are used by the wordrank binary for training.
`wr_path` is the path to the Wordrank directory.
`corpus_file` is the filename of the text file to be used for training the Wordrank model.
Expects the file to contain space-separated tokens in a single line.
`out_name` is the name of the directory which will be created (in the wordrank folder) to save embeddings and training data.
It will contain the following:
Word embeddings, saved after every dump_period to a file model_word_"current iter".txt
Context embeddings, saved after every dump_period to a file model_context_"current iter".txt
A meta directory which contains: 'vocab.txt' - vocab words, 'wiki.toy' - word-word co-occurrence values, 'meta' - vocab and co-occurrence lengths
`size` is the dimensionality of the feature vectors.
`window` is the number of context words to the left (and to the right, if symmetric = 1).
`symmetric` if 0, only use left context words, else use both left and right.
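To tie the documented parameters together, here is a minimal usage sketch under stated assumptions: a local Wordrank checkout at a placeholder path and the text8 corpus in the working directory; the parameter values mirror the defaults documented above, and 'king' is just an example query word.

    from gensim.models.wrappers import Wordrank

    # Placeholder path; train() drives the wordrank binary found under wr_path.
    model = Wordrank.train(
        '/path/to/wordrank',     # wr_path: the Wordrank directory
        corpus_file='text8',     # one line of space-separated tokens
        out_name='wr_model',     # output directory created inside the wordrank folder
        size=100,                # dimensionality of the feature vectors
        window=15, symmetric=1,  # context words on both sides
        min_count=5,             # ignore words rarer than this
        iter=90, dump_period=10, # iterations, and how often embeddings are dumped
    )

    # Wordrank subclasses KeyedVectors, so the standard query API is available.
    print(model['king'])                       # embedding vector for a word
    print(model.most_similar('king', topn=5))  # nearest neighbours

Note that with the new cleanup_files=False default from this commit, the intermediate vocab and co-occurrence files are kept on disk after training.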
