Skip to content

Commit

Permalink
Refactor docstrings for gensim.scripts. Partial fix #1665 (#1792)
Browse files Browse the repository at this point in the history
* drafts docstrings

* docstrings added, Ivan need to check format

* fixed links, headers, typos and etc

* docstrings moved to scripts inside and appear as console output in docs. setup.py and conf.py need to add sphinxcontrib.programoutput as new extension

* update configs with new extension sphinxcontrib.programoutput

* fix doc[1]

* finish with glove2word2vec (RawFormatter + examples)

* more examples for glove2word2vec

* simplify example for g2w2v + fix for w2v2t

* fix segment_wiki

* revert make_wikicorpus (should be fixed in refactoring)

* revert [2]
  • Loading branch information
yurkai authored and menshikh-iv committed Feb 9, 2018
1 parent f4354ce commit 117d447
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 82 deletions.
2 changes: 1 addition & 1 deletion docs/src/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon', 'sphinx.ext.imgmath']
extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon', 'sphinx.ext.imgmath', 'sphinxcontrib.programoutput']
autoclass_content = "both"

# Add any paths that contain templates here, relative to this directory.
Expand Down
101 changes: 79 additions & 22 deletions gensim/scripts/glove2word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,56 @@
# Copyright (C) 2016 Manas Ranjan Kar <manasrkar91@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
USAGE:
$ python -m gensim.scripts.glove2word2vec --input <GloVe vector file> --output <Word2vec vector file>

Where:
"""This script allows to convert GloVe vectors into the word2vec. Both files are
presented in text format and almost identical except that word2vec includes
number of vectors and its dimension which is only difference regard to GloVe.
* <GloVe vector file>: Input GloVe .txt file.
* <Word2vec vector file>: Desired name of output Word2vec .txt file.
Notes
-----
This script is used to convert GloVe vectors in text format into the word2vec text format.
The only difference between the two formats is an extra header line in word2vec,
which contains the number of vectors and their dimensionality (two integers).
"""
GloVe format (real example can be founded `on Stanford size <https://nlp.stanford.edu/projects/glove/>`_) ::
word1 0.123 0.134 0.532 0.152
word2 0.934 0.412 0.532 0.159
word3 0.334 0.241 0.324 0.188
...
word9 0.334 0.241 0.324 0.188
Word2Vec format (real example can be founded `on w2v old repository <https://code.google.com/archive/p/word2vec/>`_) ::
9 4
word1 0.123 0.134 0.532 0.152
word2 0.934 0.412 0.532 0.159
word3 0.334 0.241 0.324 0.188
...
word9 0.334 0.241 0.324 0.188
How to use
----------
>>> from gensim.test.utils import datapath, get_tmpfile
>>> from gensim.models import KeyedVectors
>>>
>>> glove_file = datapath('test_glove.txt')
>>> tmp_file = get_tmpfile("test_word2vec.txt")
>>>
>>> # call glove2word2vec script
>>> # default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
>>> from gensim.scripts.glove2word2vec import glove2word2vec
>>> glove2word2vec(glove_file, tmp_file)
>>>
>>> model = KeyedVectors.load_word2vec_format(tmp_file)
Command line arguments
----------------------
.. program-output:: python -m gensim.scripts.glove2word2vec --help
:ellipsis: 0, -5
"""
import sys
import logging
import argparse
Expand All @@ -29,7 +65,19 @@


def get_glove_info(glove_file_name):
"""Return the number of vectors and dimensions in a file in GloVe format."""
"""Get number of vectors in provided `glove_file_name` and dimension of vectors.
Parameters
----------
glove_file_name : str
Path to file in GloVe format.
Returns
-------
(int, int)
Number of vectors (lines) of input file and its dimension.
"""
with smart_open(glove_file_name) as f:
num_lines = sum(1 for _ in f)
with smart_open(glove_file_name) as f:
Expand All @@ -38,7 +86,21 @@ def get_glove_info(glove_file_name):


def glove2word2vec(glove_input_file, word2vec_output_file):
"""Convert `glove_input_file` in GloVe format into `word2vec_output_file` in word2vec format."""
"""Convert `glove_input_file` in GloVe format to word2vec format and write it to `word2vec_output_file`.
Parameters
----------
glove_input_file : str
Path to file in GloVe format.
word2vec_output_file: str
Path to output file.
Returns
-------
(int, int)
Number of vectors (lines) of input file and its dimension.
"""
num_lines, num_dims = get_glove_info(glove_input_file)
logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file)
with smart_open(word2vec_output_file, 'wb') as fout:
Expand All @@ -50,17 +112,12 @@ def glove2word2vec(glove_input_file, word2vec_output_file):


if __name__ == "__main__":
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.root.setLevel(level=logging.INFO)
logger.info("running %s", ' '.join(sys.argv))

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", required=True, help="Input file, in gloVe format (read-only).")
parser.add_argument(
"-o", "--output", required=True, help="Output file, in word2vec text format (will be overwritten)."
)
logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO)
parser = argparse.ArgumentParser(description=__doc__[:-135], formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-i", "--input", required=True, help="Path to input file in GloVe format")
parser.add_argument("-o", "--output", required=True, help="Path to output file")
args = parser.parse_args()

# do the actual conversion
logger.info("running %s", ' '.join(sys.argv))
num_lines, num_dims = glove2word2vec(args.input, args.output)
logger.info('Converted model with %i vectors and %i dimensions', num_lines, num_dims)
65 changes: 38 additions & 27 deletions gensim/scripts/segment_wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,39 +4,51 @@
# Author: Jayant Jain <jayant@rare-technologies.com>
# Copyright (C) 2016 RaRe Technologies

"""
CLI script for extracting plain text out of a raw Wikipedia dump. Input is an xml.bz2 file provided by MediaWiki \
that looks like <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2 \
"""This script using for extracting plain text out of a raw Wikipedia dump. Input is an xml.bz2 file provided
by MediaWiki that looks like <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2
(e.g. 14 GB of https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2).
It streams through all the XML articles using multiple cores (#cores - 1, by default), \
It streams through all the XML articles using multiple cores (#cores - 1, by default),
decompressing on the fly and extracting plain text from the articles and their sections.
For each extracted article, it prints its title, section names and plain text section contents, in json-line format.
Examples
--------
How to use
----------
#. Process Wikipedia dump with this script ::
python -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz
#. Read output in simple way
>>> from smart_open import smart_open
>>> import json
>>>
>>> # iterate over the plain text data we just created
>>> for line in smart_open('enwiki-latest.json.gz'):
>>> # decode each JSON line into a Python dictionary object
>>> article = json.loads(line)
>>>
>>> # each article has a "title", a mapping of interlinks and a list of "section_titles" and "section_texts".
>>> print("Article title: %s" % article['title'])
>>> print("Interlinks: %s" + article['interlinks'])
>>> for section_title, section_text in zip(article['section_titles'], article['section_texts']):
>>> print("Section title: %s" % section_title)
>>> print("Section text: %s" % section_text)
Notes
-----
Processing the entire English Wikipedia dump takes 1.7 hours (about 3 million articles per hour,
or 10 MB of XML per second) on an 8 core Intel i7-7700 @3.60GHz.
python -m gensim.scripts.segment_wiki -h
python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz
Command line arguments
----------------------
Processing the entire English Wikipedia dump takes 1.7 hours (about 3 million articles per hour, \
or 10 MB of XML per second) on an 8 core Intel i7-7700 @3.60GHz.
.. program-output:: python -m gensim.scripts.segment_wiki --help
:ellipsis: 0, -10
You can then read the created output (~6.1 GB gzipped) with:
>>> # iterate over the plain text data we just created
>>> for line in smart_open('enwiki-latest.json.gz'):
>>> # decode each JSON line into a Python dictionary object
>>> article = json.loads(line)
>>>
>>> # each article has a "title", a mapping of interlinks and a list of "section_titles" and "section_texts".
>>> print("Article title: %s" % article['title'])
>>> print("Interlinks: %s" + article['interlinks'])
>>> for section_title, section_text in zip(article['section_titles'], article['section_texts']):
>>> print("Section title: %s" % section_title)
>>> print("Section text: %s" % section_text)
"""

import argparse
Expand Down Expand Up @@ -338,10 +350,8 @@ def get_texts_with_sections(self):


if __name__ == "__main__":
logging.basicConfig(format='%(asctime)s : %(processName)s : %(levelname)s : %(message)s', level=logging.INFO)
logger.info("running %s", " ".join(sys.argv))

parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__'])
logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO)
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=__doc__[:-136])
default_workers = max(1, multiprocessing.cpu_count() - 1)
parser.add_argument('-f', '--file', help='Path to MediaWiki database dump (read-only).', required=True)
parser.add_argument(
Expand All @@ -367,6 +377,7 @@ def get_texts_with_sections(self):
)
args = parser.parse_args()

logger.info("running %s", " ".join(sys.argv))
segment_and_write_all_articles(
args.file, args.output,
min_article_character=args.min_article_character,
Expand Down
71 changes: 40 additions & 31 deletions gensim/scripts/word2vec2tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,37 @@
# Copyright (C) 2016 Silvio Olivastri <silvio.olivastri@gmail.com>
# Copyright (C) 2016 Radim Rehurek <radim@rare-technologies.com>

"""
USAGE: $ python -m gensim.scripts.word2vec2tensor --input <Word2Vec model file> --output <TSV tensor filename prefix> \
[--binary] <Word2Vec binary flag>

Where:
"""This script allows converting word-vectors from word2vec format into Tensorflow 2D tensor and metadata format.
This script used for for word-vector visualization on `Embedding Visualization <http://projector.tensorflow.org/>`_.
* <Word2Vec model file>: Input Word2Vec model.
* <TSV tensor filename prefix>: 2D tensor TSV output file name prefix.
* <Word2Vec binary flag>: Set True if Word2Vec model is binary. Defaults to False.
Output:
The script will create two TSV files. A 2d tensor format file, and a Word Embedding metadata file. Both files will
use the --output file name as prefix.
How to use
----------
#. Convert your word-vector with this script (for example, we'll use model from
`gensim-data <https://rare-technologies.com/new-download-api-for-pretrained-nlp-models-and-datasets-in-gensim/>`_) ::
This script is used to convert the word2vec format to Tensorflow 2D tensor
and metadata formats for Embedding Visualization.
To use the generated TSV 2D tensor and metadata file in the Projector Visualizer, please
python -m gensim.downloader -d glove-wiki-gigaword-50 # download model in word2vec format
python -m gensim.scripts.word2vec2tensor -i ~/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz \
-o /tmp/my_model_prefix
1) Open http://projector.tensorflow.org/.
2) Choose "Load Data" from the left menu.
3) Select "Choose file" in "Load a TSV file of vectors." and choose you local "_tensor.tsv" file.
4) Select "Choose file" in "Load a TSV file of metadata." and choose you local "_metadata.tsv" file.
#. Open http://projector.tensorflow.org/
#. Click "Load Data" button from the left menu.
#. Select "Choose file" in "Load a TSV file of vectors." and choose "/tmp/my_model_prefix_tensor.tsv" file.
#. Select "Choose file" in "Load a TSV file of metadata." and choose "/tmp/my_model_prefix_metadata.tsv" file.
#. ???
#. PROFIT!
For more information about TensorBoard TSV format please visit:
https://www.tensorflow.org/versions/master/how_tos/embedding_viz/
Command line arguments
----------------------
.. program-output:: python -m gensim.scripts.word2vec2tensor --help
:ellipsis: 0, -7
"""

import os
Expand All @@ -44,12 +49,18 @@


def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
"""Convert Word2Vec mode to 2D tensor TSV file and metadata file
"""Convert file in Word2Vec format and writes two files 2D tensor TSV file.
File "tensor_filename"_tensor.tsv contains word-vectors, "tensor_filename"_metadata.tsv contains words.
Args:
word2vec_model_path (str): word2vec model file path.
tensor_filename (str): filename prefix.
binary (bool): set True to use a binary Word2Vec model, defaults to False.
Parameters
----------
word2vec_model_path : str
Path to file in Word2Vec format.
tensor_filename : str
Prefix for output files.
binary : bool, optional
True if input file in binary format.
"""
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary)
Expand All @@ -68,18 +79,16 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):


if __name__ == "__main__":
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.root.setLevel(level=logging.INFO)
logger.info("running %s", ' '.join(sys.argv))

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", required=True, help="Input word2vec model")
parser.add_argument("-o", "--output", required=True, help="Output tensor file name prefix")
logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO)
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__[:-138])
parser.add_argument("-i", "--input", required=True, help="Path to input file in word2vec format")
parser.add_argument("-o", "--output", required=True, help="Prefix path for output files")
parser.add_argument(
"-b", "--binary", required=False, help="If word2vec model in binary format, set True, else False"
"-b", "--binary", action='store_const', const=True, default=False,
help="Set this flag if word2vec model in binary format (default: %(default)s)"
)
args = parser.parse_args()

logger.info("running %s", ' '.join(sys.argv))
word2vec2tensor(args.input, args.output, args.binary)

logger.info("finished running %s", os.path.basename(sys.argv[0]))
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def finalize_options(self):
'distributed': distributed_env,
'test-win': win_testenv,
'test': linux_testenv,
'docs': linux_testenv + distributed_env + ['sphinx', 'sphinxcontrib-napoleon', 'plotly', 'pattern'],
'docs': linux_testenv + distributed_env + ['sphinx', 'sphinxcontrib-napoleon', 'plotly', 'pattern', 'sphinxcontrib.programoutput'],
},

include_package_data=True,
Expand Down

0 comments on commit 117d447

Please sign in to comment.