Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove native Python implementations of Cython extensions #2630

Merged
merged 23 commits into from
Oct 25, 2019
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ jobs:

- run:
name: Build documentation
environment:
TOX_PARALLEL_NO_SPINNER: 1
command: |
source venv/bin/activate
tox -e compile,docs -vv
Expand Down
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ cache:
- $HOME/.pip-cache
dist: trusty
language: python
env:
TOX_PARALLEL_NO_SPINNER: 1


matrix:
Expand Down
1 change: 1 addition & 0 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ environment:
WHEELHOUSE_UPLOADER_USERNAME: "Lev.Konstantinovskiy"
WHEELHOUSE_UPLOADER_SECRET:
secure: qXqY3dFmLOqvxa3Om2gQi/BjotTOK+EP2IPLolBNo0c61yDtNWxbmE4wH3up72Be
TOX_PARALLEL_NO_SPINNER: 1

matrix:
- PYTHON: "C:\\Python35-x64"
Expand Down
183 changes: 3 additions & 180 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from scipy.linalg.special_matrices import triu
from scipy.special import psi # gamma function utils

from six import iteritems, itervalues, string_types
from six import iteritems, itervalues
from six.moves import zip, range


Expand Down Expand Up @@ -1424,183 +1424,6 @@ def close(self):

try:
# try to load fast, cythonized code if possible
from gensim.corpora._mmreader import MmReader
from gensim.corpora._mmreader import MmReader # noqa
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
except ImportError:
FAST_VERSION = -1

class MmReader(object):
"""Matrix market file reader, used internally in :class:`~gensim.corpora.mmcorpus.MmCorpus`.

Wrap a term-document matrix on disk (in matrix-market format), and present it
as an object which supports iteration over the rows (~documents).

Attributes
----------
num_docs : int
Number of documents in market matrix file.
num_terms : int
Number of terms.
num_nnz : int
Number of non-zero terms.

Notes
-----
Note that the file is read into memory one document at a time, not the whole matrix at once
(unlike e.g. `scipy.io.mmread` and other implementations).
This allows us to process corpora which are larger than the available RAM.

"""
def __init__(self, input, transposed=True):
"""

Parameters
----------
input : {str, file-like object}
Path to the input file in MM format or a file-like object that supports `seek()`
(e.g. smart_open objects).
transposed : bool, optional
Do lines represent `doc_id, term_id, value`, instead of `term_id, doc_id, value`?

"""
logger.info("initializing corpus reader from %s", input)
self.input, self.transposed = input, transposed
with utils.open_file(self.input) as lines:
try:
header = utils.to_unicode(next(lines)).strip()
if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
raise ValueError(
"File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
(self.input, header)
)
except StopIteration:
pass

self.num_docs = self.num_terms = self.num_nnz = 0
for lineno, line in enumerate(lines):
line = utils.to_unicode(line)
if not line.startswith('%'):
self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
if not self.transposed:
self.num_docs, self.num_terms = self.num_terms, self.num_docs
break

logger.info(
"accepted corpus with %i documents, %i features, %i non-zero entries",
self.num_docs, self.num_terms, self.num_nnz
)

def __len__(self):
"""Get the corpus size: total number of documents."""
return self.num_docs

def __str__(self):
return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
(self.num_docs, self.num_terms, self.num_nnz))

def skip_headers(self, input_file):
"""Skip file headers that appear before the first document.

Parameters
----------
input_file : iterable of str
Iterable taken from file in MM format.

"""
for line in input_file:
if line.startswith(b'%'):
continue
break

def __iter__(self):
"""Iterate through all documents in the corpus.

Notes
-----
Note that the total number of vectors returned is always equal to the number of rows specified
in the header: empty documents are inserted and yielded where appropriate, even if they are not explicitly
stored in the Matrix Market file.

Yields
------
(int, list of (int, number))
Document id and document in sparse bag-of-words format.

"""
with utils.file_or_filename(self.input) as lines:
self.skip_headers(lines)

previd = -1
for line in lines:
docid, termid, val = utils.to_unicode(line).split() # needed for python3
if not self.transposed:
termid, docid = docid, termid
# -1 because matrix market indexes are 1-based => convert to 0-based
docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
assert previd <= docid, "matrix columns must come in ascending order"
if docid != previd:
# change of document: return the document read so far (its id is previd)
if previd >= 0:
yield previd, document # noqa:F821

# return implicit (empty) documents between previous id and new id
# too, to keep consistent document numbering and corpus length
for previd in range(previd + 1, docid):
yield previd, []

# from now on start adding fields to a new document, with a new id
previd = docid
document = []

document.append((termid, val,)) # add another field to the current document

# handle the last document, as a special case
if previd >= 0:
yield previd, document

# return empty documents between the last explicit document and the number
# of documents as specified in the header
for previd in range(previd + 1, self.num_docs):
yield previd, []

def docbyoffset(self, offset):
"""Get the document at file offset `offset` (in bytes).

Parameters
----------
offset : int
File offset, in bytes, of the desired document.

Returns
-------
list of (int, str)
Document in sparse bag-of-words format.

"""
# empty documents are not stored explicitly in MM format, so the index marks
# them with a special offset, -1.
if offset == -1:
return []
if isinstance(self.input, string_types):
fin, close_fin = utils.open(self.input, 'rb'), True
else:
fin, close_fin = self.input, False

fin.seek(offset) # works for gzip/bz2 input, too
previd, document = -1, []
for line in fin:
docid, termid, val = line.split()
if not self.transposed:
termid, docid = docid, termid
# -1 because matrix market indexes are 1-based => convert to 0-based
docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
assert previd <= docid, "matrix columns must come in ascending order"
if docid != previd:
if previd >= 0:
break
previd = docid

document.append((termid, val,)) # add another field to the current document

if close_fin:
fin.close()
return document
raise utils.NO_CYTHON
18 changes: 2 additions & 16 deletions gensim/models/base_any2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,9 @@
from six.moves import range
from six import itervalues, string_types
from gensim import matutils
from numpy import float32 as REAL, ones, random, dtype, zeros
from numpy import float32 as REAL, ones, random, dtype
from types import GeneratorType
from gensim.utils import deprecated
import warnings
import os
import copy

Expand Down Expand Up @@ -647,7 +646,7 @@ def _set_train_params(self, **kwargs):

def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100, epochs=5, callbacks=(),
batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5,
ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs):
ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, **kwargs):
"""

Parameters
Expand Down Expand Up @@ -712,8 +711,6 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
compute_loss : bool, optional
If True, loss will be computed while training the Word2Vec model and stored in
:attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss` attribute.
fast_version : {-1, 1}, optional
Whether or not the fast cython implementation of the internal training methods is available. 1 means it is.
**kwargs : object
Key word arguments needed to allow children classes to accept more arguments.

Expand All @@ -738,17 +735,6 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
super(BaseWordEmbeddingsModel, self).__init__(
workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words)

if fast_version < 0:
warnings.warn(
"C extension not loaded, training will be slow. "
"Install a C compiler and reinstall gensim for fast training."
)
self.neg_labels = []
if self.negative > 0:
# precompute negative labels optimization for pure-python training
self.neg_labels = zeros(self.negative + 1)
self.neg_labels[0] = 1.

if sentences is not None or corpus_file is not None:
self._check_input_data_sanity(data_iterable=sentences, corpus_file=corpus_file)
if corpus_file is not None and not isinstance(corpus_file, string_types):
Expand Down
1 change: 0 additions & 1 deletion gensim/models/deprecated/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@

logger = logging.getLogger(__name__)

FAST_VERSION = -1
MAX_WORDS_IN_BATCH = 10000


Expand Down
25 changes: 0 additions & 25 deletions gensim/models/deprecated/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,6 @@

logger = logging.getLogger(__name__)


# failed... fall back to plain numpy (20-80x slower training than the above)
FAST_VERSION = -1
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
MAX_WORDS_IN_BATCH = 10000


Expand Down Expand Up @@ -588,11 +585,6 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,

self.load = call_on_class_only

if FAST_VERSION == -1:
logger.warning('Slow version of %s is being used', __name__)
else:
logger.debug('Fast version of %s is being used', __name__)

self.initialize_word_vectors()
self.sg = int(sg)
self.cum_table = None # for negative sampling
Expand Down Expand Up @@ -1007,16 +999,6 @@ def train(self, sentences, total_examples=None, total_words=None,
"""
if self.model_trimmed_post_training:
raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
if FAST_VERSION < 0:
warnings.warn(
"C extension not loaded for Word2Vec, training will be slow. "
"Install a C compiler and reinstall gensim for fast training."
)
self.neg_labels = []
if self.negative > 0:
# precompute negative labels optimization for pure-python training
self.neg_labels = zeros(self.negative + 1)
self.neg_labels[0] = 1.

if compute_loss:
self.compute_loss = compute_loss
Expand Down Expand Up @@ -1234,12 +1216,6 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
.. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb

"""
if FAST_VERSION < 0:
warnings.warn(
"C extension compilation failed, scoring will be slow. "
"Install a C compiler and reinstall gensim for fastness."
)

logger.info(
"scoring sentences with %i workers on %i vocabulary and %i features, "
"using sg=%s hs=%s sample=%s and negative=%s",
Expand Down Expand Up @@ -1852,7 +1828,6 @@ def __iter__(self):
level=logging.INFO
)
logger.info("running %s", " ".join(sys.argv))
logger.info("using optimization %s", FAST_VERSION)

# check and process cmdline input
program = os.path.basename(sys.argv[0])
Expand Down
Loading