unify class comments under __init__ for consistency w/ api doc presentation
gojomo committed Dec 14, 2019
1 parent 0a0751d commit 7cf2853
Showing 5 changed files with 220 additions and 243 deletions.
82 changes: 38 additions & 44 deletions gensim/models/base_any2vec.py
@@ -54,27 +54,16 @@


class BaseAny2VecModel(utils.SaveLoad):
r"""Base class for training, using and evaluating \*2vec model.
Contains implementation for multi-threaded training. The purpose of this class is to provide a
reference interface for concrete embedding implementations, whether the input space is a corpus
of words, documents or anything else. At the same time, functionality that we expect to be common
for those implementations is provided here to avoid code duplication.
In the special but usual case where the input space consists of words, a more specialized layer
is provided, consider inheriting from :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
Notes
-----
A subclass should initialize the following attributes:
def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000):
r"""Base class for training, using and evaluating \*2vec model.
* self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example)
* self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example)
* self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example)
Contains implementation for multi-threaded training. The purpose of this class is to provide a
reference interface for concrete embedding implementations, whether the input space is a corpus
of words, documents or anything else. At the same time, functionality that we expect to be common
for those implementations is provided here to avoid code duplication.
"""
def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000):
"""
In the special but usual case where the input space consists of words, a more specialized layer
is provided, consider inheriting from :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
Parameters
----------
@@ -89,6 +78,14 @@ def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_wor
batch_words : int, optional
Number of words to be processed by a single job.
Notes
-----
A subclass should initialize the following attributes:
* self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example)
* self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example)
* self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example)
"""
self.vector_size = int(vector_size)
self.workers = int(workers)
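
The `Notes` in the docstring above require a subclass to initialize `self.kv`, `self.vocabulary` and `self.trainables`. As an illustration only, not part of this commit, a minimal subclass might look like the sketch below, assuming the gensim 3.x helper classes named in the docstring; the hypothetical `ToyAny2VecModel` would still need real implementations of the training hooks (`_do_train_job` and friends) before it could train.

.. sourcecode:: python

    from gensim.models.base_any2vec import BaseAny2VecModel
    from gensim.models.keyedvectors import Word2VecKeyedVectors
    from gensim.models.word2vec import Word2VecVocab, Word2VecTrainables

    class ToyAny2VecModel(BaseAny2VecModel):
        """Hypothetical subclass, shown only to illustrate the required attributes."""

        def __init__(self, vector_size=100, workers=3, epochs=5):
            # The three attributes the base class docstring asks subclasses to provide:
            self.kv = Word2VecKeyedVectors(vector_size)  # keyed vectors for lookups
            self.vocabulary = Word2VecVocab()            # word -> frequency bookkeeping
            self.trainables = Word2VecTrainables(vector_size=vector_size, seed=1, hashfxn=hash)
            super(ToyAny2VecModel, self).__init__(
                workers=workers, vector_size=vector_size, epochs=epochs)
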
@@ -601,7 +598,7 @@ def load(cls, fname_or_handle, **kwargs):
return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs)

def save(self, fname_or_handle, **kwargs):
""""Save the object to file.
"""Save the object to file.
Parameters
----------
@@ -620,33 +617,10 @@ def save(self, fname_or_handle, **kwargs):
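
For context, not part of this commit: `save` and `load` round-trip a model through a file. A quick sketch using the :class:`~gensim.models.word2vec.Word2Vec` subclass, with a hypothetical path:

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>> model = Word2Vec([["hello", "world"], ["hello", "gensim"]], min_count=1)
    >>> model.save('/tmp/toy_model')  # hypothetical path
    >>> model = Word2Vec.load('/tmp/toy_model')
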


class BaseWordEmbeddingsModel(BaseAny2VecModel):
"""Base class containing common methods for training, using & evaluating word embeddings learning models.
See Also
--------
:class:`~gensim.models.word2vec.Word2Vec`.
Word2Vec model - embeddings for words.
:class:`~gensim.models.fasttext.FastText`.
FastText model - embeddings for words (ngram-based).
:class:`~gensim.models.doc2vec.Doc2Vec`.
Doc2Vec model - embeddings for documents.
:class:`~gensim.models.poincare.PoincareModel`
Poincare model - embeddings for graphs.
"""
def _clear_post_train(self):
raise NotImplementedError()

def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
raise NotImplementedError()

def _set_train_params(self, **kwargs):
raise NotImplementedError()

def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100, epochs=5, callbacks=(),
batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5,
ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, **kwargs):
"""
"""Base class containing common methods for training, using & evaluating word embeddings learning models.
Parameters
----------
@@ -713,6 +687,17 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
**kwargs : object
Key word arguments needed to allow children classes to accept more arguments.
See Also
--------
:class:`~gensim.models.word2vec.Word2Vec`.
Word2Vec model - embeddings for words.
:class:`~gensim.models.fasttext.FastText`.
FastText model - embeddings for words (ngram-based).
:class:`~gensim.models.doc2vec.Doc2Vec`.
Doc2Vec model - embeddings for documents.
:class:`~gensim.models.poincare.PoincareModel`
Poincare model - embeddings for graphs.
"""
self.sg = int(sg)
if vector_size % 4 != 0:
@@ -753,6 +738,15 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
"and is not stored as part of the model. Model initialized without sentences. "
"trim_rule provided, if any, will be ignored.")

def _clear_post_train(self):
raise NotImplementedError()

def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
raise NotImplementedError()

def _set_train_params(self, **kwargs):
raise NotImplementedError()

def __str__(self):
"""Get a human readable representation of the object.
92 changes: 41 additions & 51 deletions gensim/models/doc2vec.py
@@ -161,43 +161,11 @@ def count(self):


class Doc2Vec(BaseWordEmbeddingsModel):
"""Class for training, using and evaluating neural networks described in
`Distributed Representations of Sentences and Documents <http://arxiv.org/abs/1405.4053v2>`_.
Some important internal attributes are the following:
Attributes
----------
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object essentially contains the mapping between words and embeddings. After training, it can be used
directly to query those embeddings in various ways. See the module level docstring for examples.
docvecs : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object contains the paragraph vectors learned from the training data. There will be one such vector
for each unique document tag supplied during training. They may be individually accessed using the tag
as an indexed-access key. For example, if one of the training documents used a tag of 'doc003':
.. sourcecode:: pycon
>>> model.docvecs['doc003']
vocabulary : :class:`~gensim.models.doc2vec.Doc2VecVocab`
This object represents the vocabulary (sometimes called Dictionary in gensim) of the model.
Besides keeping track of all unique words, this object provides extra functionality, such as
sorting words by frequency, or discarding extremely rare words.
trainables : :class:`~gensim.models.doc2vec.Doc2VecTrainables`
This object represents the inner shallow neural network used to train the embeddings. The semantics of the
network differ slightly in the two available training modes (CBOW or SG) but you can think of it as a NN with
a single projection and hidden layer which we train on the corpus. The weights are then used as our embeddings
The only addition to the underlying NN used in :class:`~gensim.models.word2vec.Word2Vec` is that the input
includes not only the word vectors of each word in the context, but also the paragraph vector.
"""
def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0,
dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(),
**kwargs):
"""
"""Class for training, using and evaluating neural networks described in
`Distributed Representations of Sentences and Documents <http://arxiv.org/abs/1405.4053v2>`_.
Parameters
----------
@@ -286,6 +254,36 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo
callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional
List of callbacks that need to be executed/run at specific stages during training.
Some important internal attributes are the following:
Attributes
----------
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object essentially contains the mapping between words and embeddings. After training, it can be used
directly to query those embeddings in various ways. See the module level docstring for examples.
docvecs : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object contains the paragraph vectors learned from the training data. There will be one such vector
for each unique document tag supplied during training. They may be individually accessed using the tag
as an indexed-access key. For example, if one of the training documents used a tag of 'doc003':
.. sourcecode:: pycon
>>> model.docvecs['doc003']
vocabulary : :class:`~gensim.models.doc2vec.Doc2VecVocab`
This object represents the vocabulary (sometimes called Dictionary in gensim) of the model.
Besides keeping track of all unique words, this object provides extra functionality, such as
sorting words by frequency, or discarding extremely rare words.
trainables : :class:`~gensim.models.doc2vec.Doc2VecTrainables`
This object represents the inner shallow neural network used to train the embeddings. The semantics
of the network differ slightly in the two available training modes (CBOW or SG) but you can think
of it as a NN with a single projection and hidden layer which we train on the corpus. The weights are
then used as our embeddings. The only addition to the underlying NN used in
:class:`~gensim.models.word2vec.Word2Vec` is that the input includes not only the word vectors of
each word in the context, but also the paragraph vector.
"""
super(Doc2Vec, self).__init__(
sg=(1 + dm) % 2,
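
To make the attribute descriptions above concrete, here is an illustrative sketch, not part of this commit: train on a tiny corpus, then read a paragraph vector back by its tag and a word vector from the same model.

.. sourcecode:: pycon

    >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    >>> corpus = [TaggedDocument(words=["some", "words", "here"], tags=["doc003"])]
    >>> model = Doc2Vec(corpus, vector_size=5, min_count=1, epochs=2)
    >>> vector = model.docvecs["doc003"]  # paragraph vector learned for this tag
    >>> word_vector = model.wv["words"]   # word vector from the same model
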
@@ -962,13 +960,10 @@ def similarity_unseen_docs(self, doc_words1, doc_words2, alpha=None, min_alpha=N


class Doc2VecVocab(Word2VecVocab):
"""Vocabulary used by :class:`~gensim.models.doc2vec.Doc2Vec`.
This includes a mapping from words found in the corpus to their total frequency count.
"""
def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75):
"""
"""Vocabulary used by :class:`~gensim.models.doc2vec.Doc2Vec`.
This includes a mapping from words found in the corpus to their total frequency count.
Parameters
----------
@@ -1112,8 +1107,8 @@ def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_pe


class Doc2VecTrainables(Word2VecTrainables):
"""Represents the inner shallow neural network used to train :class:`~gensim.models.doc2vec.Doc2Vec`."""
def __init__(self, dm=1, dm_concat=0, dm_tag_count=1, vector_size=100, seed=1, hashfxn=hash, window=5):
"""Represents the inner shallow neural network used to train :class:`~gensim.models.doc2vec.Doc2Vec`."""
super(Doc2VecTrainables, self).__init__(
vector_size=vector_size, seed=seed, hashfxn=hashfxn)
if dm and dm_concat:
@@ -1145,10 +1140,8 @@ def reset_doc_weights(self, docvecs):


class TaggedBrownCorpus(object):
"""Reader for the `Brown corpus (part of NLTK data) <http://www.nltk.org/book/ch02.html#tab-brown-sources>`_."""

def __init__(self, dirname):
"""
"""Reader for the `Brown corpus (part of NLTK data) <http://www.nltk.org/book/ch02.html#tab-brown-sources>`_.
Parameters
----------
@@ -1185,14 +1178,11 @@ def __iter__(self):


class TaggedLineDocument(object):
"""Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object.
Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
automatically from the document line number (each document gets a unique integer tag).
"""
def __init__(self, source):
"""
"""Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object.
Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
automatically from the document line number (each document gets a unique integer tag).
Parameters
----------
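
A usage sketch for the class above, not part of this commit and with a hypothetical file name: each line of the source file becomes one tagged document, with the line number as its tag.

.. sourcecode:: pycon

    >>> from gensim.models.doc2vec import TaggedLineDocument
    >>> documents = TaggedLineDocument('my_corpus.txt')  # one whitespace-tokenized document per line
    >>> for doc in documents:
    ...     print(doc.tags, doc.words)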
