unify class comments under __init__ for consistency w/ api doc presentation
gojomo committed Dec 14, 2019
1 parent 0a0751d commit 7cf2853
Showing 5 changed files with 220 additions and 243 deletions.
82 changes: 38 additions & 44 deletions gensim/models/base_any2vec.py
@@ -54,27 +54,16 @@


class BaseAny2VecModel(utils.SaveLoad):
r"""Base class for training, using and evaluating \*2vec model.
Contains implementation for multi-threaded training. The purpose of this class is to provide a
reference interface for concrete embedding implementations, whether the input space is a corpus
of words, documents or anything else. At the same time, functionality that we expect to be common
for those implementations is provided here to avoid code duplication.
In the special but usual case where the input space consists of words, a more specialized layer
is provided, consider inheriting from :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
Notes
-----
A subclass should initialize the following attributes:
def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000):
r"""Base class for training, using and evaluating \*2vec model.
* self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example)
* self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example)
* self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example)
Contains implementation for multi-threaded training. The purpose of this class is to provide a
reference interface for concrete embedding implementations, whether the input space is a corpus
of words, documents or anything else. At the same time, functionality that we expect to be common
for those implementations is provided here to avoid code duplication.
"""
def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000):
"""
In the special but usual case where the input space consists of words, a more specialized layer
is provided, consider inheriting from :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
Parameters
----------
@@ -89,6 +78,14 @@ def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_wor
batch_words : int, optional
Number of words to be processed by a single job.
Notes
-----
A subclass should initialize the following attributes:
* self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example)
* self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example)
* self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example)
"""
self.vector_size = int(vector_size)
self.workers = int(workers)
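
The `Notes` in the docstring above require a subclass to initialize `self.kv`, `self.vocabulary` and `self.trainables`. As an illustration only, not part of this commit, a minimal subclass might look like the sketch below, assuming the gensim 3.x helper classes named in the docstring; the hypothetical `ToyAny2VecModel` would still need real implementations of the training hooks (`_do_train_job` and friends) before it could train.

.. sourcecode:: python

    from gensim.models.base_any2vec import BaseAny2VecModel
    from gensim.models.keyedvectors import Word2VecKeyedVectors
    from gensim.models.word2vec import Word2VecVocab, Word2VecTrainables

    class ToyAny2VecModel(BaseAny2VecModel):
        """Hypothetical subclass, shown only to illustrate the required attributes."""

        def __init__(self, vector_size=100, workers=3, epochs=5):
            # The three attributes the base class docstring asks subclasses to provide:
            self.kv = Word2VecKeyedVectors(vector_size)  # keyed vectors for lookups
            self.vocabulary = Word2VecVocab()            # word -> frequency bookkeeping
            self.trainables = Word2VecTrainables(vector_size=vector_size, seed=1, hashfxn=hash)
            super(ToyAny2VecModel, self).__init__(
                workers=workers, vector_size=vector_size, epochs=epochs)
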
@@ -601,7 +598,7 @@ def load(cls, fname_or_handle, **kwargs):
return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs)

def save(self, fname_or_handle, **kwargs):
""""Save the object to file.
"""Save the object to file.
Parameters
----------
@@ -620,33 +617,10 @@ def save(self, fname_or_handle, **kwargs):
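
For context, not part of this commit: `save` and `load` round-trip a model through a file. A quick sketch using the :class:`~gensim.models.word2vec.Word2Vec` subclass, with a hypothetical path:

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>> model = Word2Vec([["hello", "world"], ["hello", "gensim"]], min_count=1)
    >>> model.save('/tmp/toy_model')  # hypothetical path
    >>> model = Word2Vec.load('/tmp/toy_model')
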


class BaseWordEmbeddingsModel(BaseAny2VecModel):
"""Base class containing common methods for training, using & evaluating word embeddings learning models.
See Also
--------
:class:`~gensim.models.word2vec.Word2Vec`.
Word2Vec model - embeddings for words.
:class:`~gensim.models.fasttext.FastText`.
FastText model - embeddings for words (ngram-based).
:class:`~gensim.models.doc2vec.Doc2Vec`.
Doc2Vec model - embeddings for documents.
:class:`~gensim.models.poincare.PoincareModel`
Poincare model - embeddings for graphs.
"""
def _clear_post_train(self):
raise NotImplementedError()

def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
raise NotImplementedError()

def _set_train_params(self, **kwargs):
raise NotImplementedError()

def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100, epochs=5, callbacks=(),
batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5,
ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, **kwargs):
"""
"""Base class containing common methods for training, using & evaluating word embeddings learning models.
Parameters
----------
@@ -713,6 +687,17 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
**kwargs : object
Key word arguments needed to allow children classes to accept more arguments.
See Also
--------
:class:`~gensim.models.word2vec.Word2Vec`.
Word2Vec model - embeddings for words.
:class:`~gensim.models.fasttext.FastText`.
FastText model - embeddings for words (ngram-based).
:class:`~gensim.models.doc2vec.Doc2Vec`.
Doc2Vec model - embeddings for documents.
:class:`~gensim.models.poincare.PoincareModel`
Poincare model - embeddings for graphs.
"""
self.sg = int(sg)
if vector_size % 4 != 0:
@@ -753,6 +738,15 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
"and is not stored as part of the model. Model initialized without sentences. "
"trim_rule provided, if any, will be ignored.")

def _clear_post_train(self):
raise NotImplementedError()

def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
raise NotImplementedError()

def _set_train_params(self, **kwargs):
raise NotImplementedError()

def __str__(self):
"""Get a human readable representation of the object.
92 changes: 41 additions & 51 deletions gensim/models/doc2vec.py
@@ -161,43 +161,11 @@ def count(self):


class Doc2Vec(BaseWordEmbeddingsModel):
"""Class for training, using and evaluating neural networks described in
`Distributed Representations of Sentences and Documents <http://arxiv.org/abs/1405.4053v2>`_.
Some important internal attributes are the following:
Attributes
----------
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object essentially contains the mapping between words and embeddings. After training, it can be used
directly to query those embeddings in various ways. See the module level docstring for examples.
docvecs : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object contains the paragraph vectors learned from the training data. There will be one such vector
for each unique document tag supplied during training. They may be individually accessed using the tag
as an indexed-access key. For example, if one of the training documents used a tag of 'doc003':
.. sourcecode:: pycon
>>> model.docvecs['doc003']
vocabulary : :class:`~gensim.models.doc2vec.Doc2VecVocab`
This object represents the vocabulary (sometimes called Dictionary in gensim) of the model.
Besides keeping track of all unique words, this object provides extra functionality, such as
sorting words by frequency, or discarding extremely rare words.
trainables : :class:`~gensim.models.doc2vec.Doc2VecTrainables`
This object represents the inner shallow neural network used to train the embeddings. The semantics of the
network differ slightly in the two available training modes (CBOW or SG) but you can think of it as a NN with
a single projection and hidden layer which we train on the corpus. The weights are then used as our embeddings
The only addition to the underlying NN used in :class:`~gensim.models.word2vec.Word2Vec` is that the input
includes not only the word vectors of each word in the context, but also the paragraph vector.
"""
def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0,
dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(),
**kwargs):
"""
"""Class for training, using and evaluating neural networks described in
`Distributed Representations of Sentences and Documents <http://arxiv.org/abs/1405.4053v2>`_.
Parameters
----------
@@ -286,6 +254,36 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo
callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional
List of callbacks that need to be executed/run at specific stages during training.
Some important internal attributes are the following:
Attributes
----------
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object essentially contains the mapping between words and embeddings. After training, it can be used
directly to query those embeddings in various ways. See the module level docstring for examples.
docvecs : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object contains the paragraph vectors learned from the training data. There will be one such vector
for each unique document tag supplied during training. They may be individually accessed using the tag
as an indexed-access key. For example, if one of the training documents used a tag of 'doc003':
.. sourcecode:: pycon
>>> model.docvecs['doc003']
vocabulary : :class:`~gensim.models.doc2vec.Doc2VecVocab`
This object represents the vocabulary (sometimes called Dictionary in gensim) of the model.
Besides keeping track of all unique words, this object provides extra functionality, such as
sorting words by frequency, or discarding extremely rare words.
trainables : :class:`~gensim.models.doc2vec.Doc2VecTrainables`
This object represents the inner shallow neural network used to train the embeddings. The semantics
of the network differ slightly in the two available training modes (CBOW or SG) but you can think
of it as a NN with a single projection and hidden layer which we train on the corpus. The weights are
then used as our embeddings. The only addition to the underlying NN used in
:class:`~gensim.models.word2vec.Word2Vec` is that the input includes not only the word vectors of
each word in the context, but also the paragraph vector.
"""
super(Doc2Vec, self).__init__(
sg=(1 + dm) % 2,
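
To make the attribute descriptions above concrete, here is an illustrative sketch, not part of this commit: train on a tiny corpus, then read a paragraph vector back by its tag and a word vector from the same model.

.. sourcecode:: pycon

    >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    >>> corpus = [TaggedDocument(words=["some", "words", "here"], tags=["doc003"])]
    >>> model = Doc2Vec(corpus, vector_size=5, min_count=1, epochs=2)
    >>> vector = model.docvecs["doc003"]  # paragraph vector learned for this tag
    >>> word_vector = model.wv["words"]   # word vector from the same model
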
@@ -962,13 +960,10 @@ def similarity_unseen_docs(self, doc_words1, doc_words2, alpha=None, min_alpha=N


class Doc2VecVocab(Word2VecVocab):
"""Vocabulary used by :class:`~gensim.models.doc2vec.Doc2Vec`.
This includes a mapping from words found in the corpus to their total frequency count.
"""
def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75):
"""
"""Vocabulary used by :class:`~gensim.models.doc2vec.Doc2Vec`.
This includes a mapping from words found in the corpus to their total frequency count.
Parameters
----------
@@ -1112,8 +1107,8 @@ def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_pe


class Doc2VecTrainables(Word2VecTrainables):
"""Represents the inner shallow neural network used to train :class:`~gensim.models.doc2vec.Doc2Vec`."""
def __init__(self, dm=1, dm_concat=0, dm_tag_count=1, vector_size=100, seed=1, hashfxn=hash, window=5):
"""Represents the inner shallow neural network used to train :class:`~gensim.models.doc2vec.Doc2Vec`."""
super(Doc2VecTrainables, self).__init__(
vector_size=vector_size, seed=seed, hashfxn=hashfxn)
if dm and dm_concat:
@@ -1145,10 +1140,8 @@ def reset_doc_weights(self, docvecs):


class TaggedBrownCorpus(object):
"""Reader for the `Brown corpus (part of NLTK data) <http://www.nltk.org/book/ch02.html#tab-brown-sources>`_."""

def __init__(self, dirname):
"""
"""Reader for the `Brown corpus (part of NLTK data) <http://www.nltk.org/book/ch02.html#tab-brown-sources>`_.
Parameters
----------
@@ -1185,14 +1178,11 @@ def __iter__(self):


class TaggedLineDocument(object):
"""Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object.
Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
automatically from the document line number (each document gets a unique integer tag).
"""
def __init__(self, source):
"""
"""Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object.
Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
automatically from the document line number (each document gets a unique integer tag).
Parameters
----------
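
A usage sketch for the class above, not part of this commit and with a hypothetical file name: each line of the source file becomes one tagged document, with the line number as its tag.

.. sourcecode:: pycon

    >>> from gensim.models.doc2vec import TaggedLineDocument
    >>> documents = TaggedLineDocument('my_corpus.txt')  # one whitespace-tokenized document per line
    >>> for doc in documents:
    ...     print(doc.tags, doc.words)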
