[GSoC 2018] Multistream API for vocabulary building in *2vec #2078

Merged
43 commits
92e6e22  multistream scan vocab for doc2vec, word2vec & fastText (Jun 4, 2018)
2618a2e  fixes (Jun 4, 2018)
7960af8  fix tags for doc2vec (Jun 4, 2018)
b8da97a  fix tests (Jun 4, 2018)
16be716  removed benchmark vocab (Jun 4, 2018)
c2d674a  addressing comments (Jun 7, 2018)
85e689c  make interfaces and documentation more pretty (Jun 7, 2018)
0d5ae38  add word2vec multistream tests (Jun 7, 2018)
df3ae5f  fix pep8 (Jun 8, 2018)
49357cb  iteritems -> items (Jun 8, 2018)
0365eea  more precise test (Jun 8, 2018)
812ab8c  add doc2vec tests (Jun 8, 2018)
f11f44d  add fasttext tests (Jun 8, 2018)
941dfd8  remove prints (Jun 8, 2018)
36e7238  fix seed=42 (Jun 8, 2018)
fa57f7a  fixed tests (Jun 8, 2018)
9ea007d  add build_vocab test for fasttext (Jun 8, 2018)
aec68ea  fix (Jun 8, 2018)
07f3fd4  change size from 10 to 5 in fasttext test because of appveyor memory … (Jun 8, 2018)
8b49fb8  another test with memory error (Jun 8, 2018)
d0c11d9  fix py3 tests (Jun 8, 2018)
5974448  fix iteritems for py3 (Jun 8, 2018)
1419847  fix functools reduce (Jun 8, 2018)
280e826  addressing comments (Jun 12, 2018)
7d489f4  addressing @jayantj comments (Jun 13, 2018)
49a1ee6  fix language (Jun 13, 2018)
1cbad7f  add final vocab pruning in multistream modes (Jun 13, 2018)
d024625  keys -> iterkeys (Jun 14, 2018)
5e4de19  use heapq.nlargest (Jun 15, 2018)
74e7b02  fix (Jun 15, 2018)
0d12d8b  multistream flag to input_streams param (Jun 19, 2018)
25d00cd  fix tests (Jun 19, 2018)
2281265  fix flake 8 (Jun 19, 2018)
543a9e0  fix doc2vec docstrings (Jun 19, 2018)
d520d68  fix merging streams (Jun 19, 2018)
d11a0b8  fix doc2vec (Jun 19, 2018)
ecd8f39  max_vocab_size -> max_vocab_size / workers (Jun 19, 2018)
a96d5a4  fixed (Jun 19, 2018)
0a327b0  / -> // (py3 division) (Jun 19, 2018)
62873fb  fix (Jun 19, 2018)
5f61219  Merge branch 'develop' into feature/gsoc-multistream-vocab (Jun 20, 2018)
c67f964  fix docstring (Jun 20, 2018)
a16cec0  Merge branch 'develop' into feature/gsoc-multistream-vocab (Jun 24, 2018)
34 changes: 32 additions & 2 deletions gensim/models/base_any2vec.py
@@ -330,7 +330,37 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam

def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, total_examples=None,
total_words=None, queue_factor=2, report_delay=1.0):
"""Train one epoch."""
"""Train the model for a single epoch.
gojomo (Collaborator) commented:

Not a comment about the doc-improvements here, which all look good, but a side observation about this method, which I only noticed during related reviews last week: its strategy of re-launching a fresh 'producer' thread and fresh 'worker' threads for each epoch, which I believe was introduced in #1777, likely drives down overall throughput and CPU utilization compared to the prior strategy. The one potential advantage I'd see for adopting such a full-sync teardown-and-restart between epochs would be allowing the user to specify some callback for mid-training reporting at each epoch's end, but that hasn't yet been added.

piskvorky (Owner) replied:

@gojomo why would that drive down throughput & CPU utilization?

gojomo (Collaborator) replied:

When some threads have finished an epoch but others haven't, cores will be idle, not because of the GIL etc., but because there's no thread even trying to move forward onto the next epoch's data. Plus there's the overhead of re-launching threads (magnitude unclear). The old strategy launched exactly workers + 1 threads; this one launches epochs * (workers + 1) threads.

piskvorky (Owner) replied (Jun 21, 2018):

If I understand correctly, you're worried that at the end of each epoch, some threads may be idle (while other threads are finishing their last job) until the next epoch starts.

Isn't that idleness infinitesimal, since any single job takes almost no time at all? I may be missing something, but this type of delay shouldn't even be measurable.

gojomo (Collaborator) replied (Jun 21, 2018):

I'm not sure of the magnitude, only the direction: this means more idle cores every time an epoch rolls over. Of course, only measuring could tell for sure, and the proportionate impact becomes smaller with larger corpora.

As it's not yet clear to me how the existing GIL and queue-sync bottlenecks that have been preventing higher throughput interact (or else I would have tried harder to fix them), adding yet more thread launches, waits, and syncs against a not-yet-filled queue is something I'd have been wary of doing without measurement at the time. Even the coarse once-a-second progress logging tended to show slower throughput at the beginning of training; that slow start might now be repeated at each epoch, for example via GIL contention between the first few new worker threads getting a job and the new producer thread trying to get ahead of the workers again.

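To make the thread-count arithmetic in the comments above concrete, here is a minimal, hypothetical sketch of the two scheduling strategies being compared. The function names are stand-ins; this is not gensim's actual implementation.

# Hypothetical illustration of the two strategies discussed in the review thread.
import threading
import time

def _producer():
    time.sleep(0.01)   # stand-in for filling the job queue

def _worker():
    time.sleep(0.01)   # stand-in for consuming jobs and training

def per_epoch_strategy(epochs, workers):
    """Relaunch a fresh producer + workers every epoch: epochs * (workers + 1) thread launches."""
    for _ in range(epochs):
        threads = [threading.Thread(target=_producer)]
        threads += [threading.Thread(target=_worker) for _ in range(workers)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()   # full-sync barrier: cores idle while the slowest worker finishes

def persistent_strategy(epochs, workers):
    """Keep one producer + the workers alive for the whole run: workers + 1 thread launches."""
    threads = [threading.Thread(target=_producer)]
    threads += [threading.Thread(target=_worker) for _ in range(workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

per_epoch_strategy(epochs=5, workers=3)   # 5 * (3 + 1) = 20 thread launches
persistent_strategy(epochs=5, workers=3)  # 3 + 1 = 4 thread launches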

Parameters
----------
data_iterable : iterable of list of object
The input corpus. It will be split into chunks, and these chunks will be pushed onto the job queue.
data_iterables : iterable of iterables of list of object
The iterable of input streams like `data_iterable`. Use this parameter in multistream mode.
cur_epoch : int, optional
The current training epoch, needed to compute the training parameters for each job.
For example, in many implementations the learning rate decreases as the number of epochs increases.
total_examples : int, optional
Count of objects in `data_iterable`. In the usual case this would correspond to the number of sentences
in a corpus, used to log progress.
total_words : int, optional
Total count of objects in `data_iterable`. In the usual case this would correspond to the number of raw
words in a corpus, used to log progress.
queue_factor : int, optional
Multiplier for size of queue -> size = number of workers * queue_factor.
report_delay : float, optional
Number of seconds between two consecutive progress report messages in the logger.

Returns
-------
(int, int, int)
The training report for this epoch consisting of three elements:
* Size of data chunk processed, for example number of sentences in the corpus chunk.
* Effective word count used in training (after ignoring unknown words and trimming the sentence length).
* Total word count used in training.

"""
self._check_input_data_sanity(data_iterable, data_iterables)
job_queue = Queue(maxsize=queue_factor * self.workers)
progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
@@ -762,7 +792,7 @@ def build_vocab(self, sentences=None, input_streams=None, workers=None, update=F
Can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
or :class:`~gensim.models.word2vec.LineSentence` module for such examples.
input_streams : list or tuple of iterable of iterables
The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible
to process streams in parallel, using `workers` parameter.
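For orientation, here is a hedged usage sketch of the `input_streams` parameter described by this docstring. The toy streams are made up; the constructor parameters (`size`, `min_count`, `workers`) follow the gensim 3.x API current at the time of this PR.

from gensim.models import Word2Vec

# Two toy in-memory "streams"; in practice each could be a LineSentence over a separate file.
stream_a = [["cat", "say", "meow"], ["dog", "say", "woof"]]
stream_b = [["bird", "say", "tweet"], ["cow", "say", "moo"]]

model = Word2Vec(size=10, min_count=1, workers=2)
# The streams are scanned in parallel by min(len(input_streams), workers) = 2 workers.
model.build_vocab(input_streams=[stream_a, stream_b])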
109 changes: 88 additions & 21 deletions gensim/models/doc2vec.py
@@ -402,26 +402,59 @@ class Doc2Vec(BaseWordEmbeddingsModel):
"""Class for training, using and evaluating neural networks described in
`Distributed Representations of Sentences and Documents <http://arxiv.org/abs/1405.4053v2>`_.

Some important internal attributes are the following:

Attributes
----------
wv : :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors`
This object essentially contains the mapping between words and embeddings. After training, it can be used
directly to query those embeddings in various ways. See the module level docstring for examples.

docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
This object contains the paragraph vectors. Remember that the only difference between this model and
:class:`~gensim.models.word2vec.Word2Vec` is that besides the word vectors we also include paragraph embeddings
to capture the paragraph.

In this way we can capture the difference between the same word used in different contexts.
For example, we now have a different representation of the word "leaves" in the following two sentences ::

1. Manos leaves the office every day at 18:00 to catch his train
2. This season is called Fall, because leaves fall from the trees.

In a plain :class:`~gensim.models.word2vec.Word2Vec` model the word would have exactly the same representation
in both sentences; in :class:`~gensim.models.doc2vec.Doc2Vec` it will not.

vocabulary : :class:`~gensim.models.doc2vec.Doc2VecVocab`
This object represents the vocabulary (sometimes called Dictionary in gensim) of the model.
Besides keeping track of all unique words, this object provides extra functionality, such as
sorting words by frequency, or discarding extremely rare words.

trainables : :class:`~gensim.models.doc2vec.Doc2VecTrainables`
This object represents the inner shallow neural network used to train the embeddings. The semantics of the
network differ slightly in the two available training modes (CBOW or SG) but you can think of it as a NN with
a single projection and hidden layer which we train on the corpus. The weights are then used as our embeddings.
The only addition to the underlying NN used in :class:`~gensim.models.word2vec.Word2Vec` is that the input
includes not only the word vectors of each word in the context, but also the paragraph vector.

"""
def __init__(self, documents=None, input_streams=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0,
dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(),
**kwargs):
"""Initialize the model from an iterable of `documents`. Each document is a
TaggedDocument object that will be used for training.
"""

Parameters
----------
documents : {iterable of iterables, list or tuple of iterable of iterables}
The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
consider an iterable that streams the documents directly from disk/network.
If you don't supply `documents`, the model is left uninitialized -- use if
you plan to initialize it in some other way.
documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
Input corpus. Can be simply a list of elements, but for larger corpora, consider an iterable that streams
the documents directly from disk/network. If you don't supply `documents`, the model is
left uninitialized -- use if you plan to initialize it in some other way.
input_streams : list or tuple of iterable of iterables
The tuple or list of `documents`-like arguments. Use it if you have multiple input streams. It is possible
to process streams in parallel, using `workers` parameter.
dm : int {1,0}
dm : {1,0}, optional
Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
Otherwise, `distributed bag of words` (PV-DBOW) is employed.
size : int
size : int, optional
Dimensionality of the feature vectors.
window : int, optional
The maximum distance between the current and predicted word within a sentence.
@@ -656,15 +689,14 @@ def train(self, documents=None, input_streams=None, total_examples=None, total_w

Parameters
----------
documents : iterable of iterables
The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
consider an iterable that streams the documents directly from disk/network.
See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
in :mod:`~gensim.models.doc2vec` module for such examples.
documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`
Can be simply a list of elements, but for larger corpora, consider an iterable that streams
the documents directly from disk/network. If you don't supply `documents`, the model is
left uninitialized -- use if you plan to initialize it in some other way.
input_streams : list or tuple of iterable of iterables
The tuple or list of `documents`-like arguments. Use it if you have multiple input streams. It is possible
to process streams in parallel, using `workers` parameter.
total_examples : int
total_examples : int, optional
Count of sentences.
total_words : int, optional
Count of raw words in documents.
@@ -975,19 +1007,17 @@ def build_vocab(self, documents=None, input_streams=None, update=False, progress

Parameters
----------
documents : {iterable of iterables, list or tuple of iterable of iterables}
The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`
Can be simply a list of :class:`~gensim.models.doc2vec.TaggedDocument` elements, but for larger corpora,
consider an iterable that streams the documents directly from disk/network.
See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
in :mod:`~gensim.models.doc2vec` module for such examples.
input_streams : list or tuple of iterable of iterables
The tuple or list of `documents`-like arguments. Use it if you have multiple input streams. It is possible
to process streams in parallel, using `workers` parameter.
progress_per : int
Indicates how many words to process before showing/updating the progress.
update : bool
If true, the new words in `sentences` will be added to model's vocab.
in :mod:`~gensim.models.doc2vec` module for such examples.
progress_per : int
Indicates how many words to process before showing/updating the progress.
keep_raw_vocab : bool
If not true, delete the raw vocabulary after the scaling is done and free up RAM.
trim_rule : function, optional
@@ -998,9 +1028,16 @@ def build_vocab(self, documents=None, input_streams=None, update=False, progress
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
The rule, if given, is only used to prune vocabulary during current method call and is not stored as part
of the model.
The input parameters are of the following types:
* `word` (str) - the word we are examining
* `count` (int) - the word's frequency count in the corpus
* `min_count` (int) - the minimum count threshold.

workers : int
Used if `input_streams` is passed. Determines how many processes to use for vocab building.
Actual number of workers is determined by `min(len(input_streams), workers)`.
**kwargs
Additional keyword arguments passed to the internal vocabulary construction.

"""
workers = workers or self.workers
@@ -1237,6 +1274,36 @@ def _scan_vocab_singlestream(self, documents, docvecs, progress_per, trim_rule):

def scan_vocab(self, documents=None, input_streams=None, docvecs=None, progress_per=10000, workers=None,
trim_rule=None):
"""Create the models Vocabulary: A mapping from unique words in the corpus to their frequency count.

Parameters
----------
documents : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`
The tagged documents used to create the vocabulary. Their tags can be either str tokens or ints (faster).
docvecs : list of :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
The vector representations of the documents in our corpus. Each of them has a size == `vector_size`.
progress_per : int
Progress will be logged every `progress_per` documents.
trim_rule : function, optional
Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
be trimmed away, or handled using the default (discard if word count < min_count).
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
or a callable that accepts parameters (word, count, min_count) and returns either
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
The rule, if given, is only used to prune vocabulary during
:meth:`~gensim.models.doc2vec.Doc2Vec.build_vocab` and is not stored as part of the model.

The input parameters are of the following types:
* `word` (str) - the word we are examining
* `count` (int) - the word's frequency count in the corpus
* `min_count` (int) - the minimum count threshold.

Returns
-------
(int, int)
Tuple of (total words in the corpus, number of documents).

"""
logger.info("collecting all words and their counts")
if input_streams is None:
total_words, corpus_count = self._scan_vocab_singlestream(documents, docvecs, progress_per, trim_rule)
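A sketch tying together the `input_streams`, `workers`, and `trim_rule` parameters documented in this file. The corpus and the `my_trim_rule` helper are hypothetical; the call shapes follow the docstrings in this diff.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import RULE_DISCARD, RULE_DEFAULT

# Two toy "streams"; in practice each would stream TaggedDocuments from disk/network.
stream_a = [TaggedDocument(words=["cats", "chase", "mice"], tags=[0])]
stream_b = [TaggedDocument(words=["dogs", "chase", "cats"], tags=[1])]

def my_trim_rule(word, count, min_count):
    # Drop one specific word regardless of frequency; defer to min_count for the rest.
    return RULE_DISCARD if word == "chase" else RULE_DEFAULT

model = Doc2Vec(vector_size=10, min_count=1)
# Vocabulary is scanned with min(len(input_streams), workers) = 2 workers.
model.build_vocab(input_streams=[stream_a, stream_b], workers=2, trim_rule=my_trim_rule)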
33 changes: 16 additions & 17 deletions gensim/models/fasttext.py
@@ -249,8 +249,8 @@ def __init__(self, sentences=None, input_streams=None, sg=0, hs=0, size=100, alp

Parameters
----------
sentences : {iterable of iterables, list or tuple of iterable of iterables}
The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
sentences : iterable of list of str, optional
Can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
@@ -259,11 +259,11 @@ def __init__(self, sentences=None, input_streams=None, sg=0, hs=0, size=100, alp
input_streams : list or tuple of iterable of iterables
The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible
to process streams in parallel, using `workers` parameter.
sg : int {1, 0}
Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed.
size : int
Dimensionality of the feature vectors.
window : int
min_count : int, optional
The model ignores all words with total frequency lower than this.
size : int, optional
Dimensionality of the word vectors.
window : int, optional
The maximum distance between the current and predicted word within a sentence.
workers : int, optional
Use these many worker threads to train the model (=faster training with multicore machines).
@@ -344,7 +344,6 @@ def __init__(self, sentences=None, input_streams=None, sg=0, hs=0, size=100, alp
>>> say_vector = model['say'] # get vector for word
>>> of_vector = model['of'] # get vector for out-of-vocab word


"""
self.load = call_on_class_only
self.load_fasttext_format = call_on_class_only
@@ -431,6 +430,10 @@ def build_vocab(self, sentences=None, input_streams=None, update=False, progress
input_streams : list or tuple of iterable of iterables
The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible
to process streams in parallel, using `workers` parameter.
update : bool
If true, the new words in `sentences` will be added to model's vocab.
progress_per : int
Indicates how many words to process before showing/updating the progress.
keep_raw_vocab : bool
If not true, delete the raw vocabulary after the scaling is done and free up RAM.
trim_rule : function, optional
@@ -439,21 +442,17 @@ def build_vocab(self, sentences=None, input_streams=None, update=False, progress
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
or a callable that accepts parameters (word, count, min_count) and returns either
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
of the model.
progress_per : int
Indicates how many words to process before showing/updating the progress.
update : bool
If true, the new words in `sentences` will be added to model's vocab.
workers : int
Used if `input_streams` is passed. Determines how many processes to use for vocab building.
Actual number of workers is determined by `min(len(input_streams), workers)`.
The rule, if given, is only used to prune vocabulary during
:meth:`~gensim.models.fasttext.FastText.build_vocab` and is not stored as part of the model.

The input parameters are of the following types:
* `word` (str) - the word we are examining
* `count` (int) - the word's frequency count in the corpus
* `min_count` (int) - the minimum count threshold.

workers : int
Used if `input_streams` is passed. Determines how many processes to use for vocab building.
Actual number of workers is determined by `min(len(input_streams), workers)`.
**kwargs
Additional keyword parameters passed to
:meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.build_vocab`.
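And an analogous hedged sketch for the FastText variant of the same API, assuming the PR-era gensim 3.x parameter names; the toy streams and the final `train` call are illustrative only.

from gensim.models.fasttext import FastText

stream_a = [["human", "interface", "computer"]]
stream_b = [["graph", "minors", "trees"], ["graph", "minors", "survey"]]

model = FastText(size=5, min_count=1)  # size=5 mirrors the memory-conscious choice in this PR's tests
model.build_vocab(input_streams=[stream_a, stream_b], workers=2)
model.train(sentences=stream_a + stream_b, total_examples=model.corpus_count, epochs=model.epochs)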