default isort && default black #3322

Closed · wants to merge 1 commit
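The diffs below all come from running the two formatters with their default settings: isort regroups and alphabetizes imports, while black rewraps long statements and normalizes string literals to double quotes. As a rough illustration only (not part of this PR; the sample source string below is made up), the same kind of change can be reproduced through the formatters' public Python APIs:

# Illustration only: apply default isort and default black to a source string.
# The PR itself was presumably produced by running the command-line tools
# (python -m isort . && python -m black .) over the repository.
import black
import isort

SOURCE = "import sys\nimport os\nprint('hello from %s' % sys.argv[0])\n"

sorted_source = isort.code(SOURCE)  # default isort: sort and group imports
formatted = black.format_str(sorted_source, mode=black.Mode())  # default black: quotes, wrapping
print(formatted)

Running the snippet prints the imports in alphabetical order and the print call rewritten with double quotes, which is exactly the pattern repeated throughout the file diffs that follow.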
16 changes: 8 additions & 8 deletions continuous_integration/check_wheels.py
@@ -9,30 +9,30 @@
 
 import requests
 
 
 def to_int(value):
-    value = ''.join((x for x in value if x.isdigit()))
+    value = "".join((x for x in value if x.isdigit()))
     try:
         return int(value)
     except Exception:
         return 0
 
 
 def to_tuple(version):
-    return tuple(to_int(x) for x in version.split('.'))
+    return tuple(to_int(x) for x in version.split("."))
 
 
 def main():
     project = sys.argv[1]
-    json = requests.get('https://pypi.org/pypi/%s/json' % project).json()
-    for version in sorted(json['releases'], key=to_tuple):
+    json = requests.get("https://pypi.org/pypi/%s/json" % project).json()
+    for version in sorted(json["releases"], key=to_tuple):
         print(version)
         wheel_packages = [
-            p for p in json['releases'][version]
-            if p['packagetype'] == 'bdist_wheel'
+            p for p in json["releases"][version] if p["packagetype"] == "bdist_wheel"
         ]
         for p in wheel_packages:
-            print(' %(python_version)s %(filename)s' % p)
+            print(" %(python_version)s %(filename)s" % p)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
7 changes: 3 additions & 4 deletions continuous_integration/install_wheel.py
@@ -6,12 +6,11 @@
 import subprocess
 
 curr_dir = os.path.dirname(__file__)
-dist_path = os.path.join(curr_dir, '..', 'dist')
+dist_path = os.path.join(curr_dir, "..", "dist")
 wheels = [
-    os.path.join(dist_path, f)
-    for f in os.listdir(dist_path) if f.endswith('.whl')
+    os.path.join(dist_path, f) for f in os.listdir(dist_path) if f.endswith(".whl")
 ]
 assert len(wheels) == 1, "wheels = %r" % wheels
 
-command = 'pip install --pre --force-reinstall'.split() + [wheels[0]]
+command = "pip install --pre --force-reinstall".split() + [wheels[0]]
 subprocess.check_call(command)
6 changes: 4 additions & 2 deletions continuous_integration/upgrade_pip_py310.py
@@ -3,8 +3,10 @@
 import subprocess
 import sys
 import tempfile
-if sys.platform in ('linux', 'darwin') and sys.version_info[:2] == (3, 10):
+
+if sys.platform in ("linux", "darwin") and sys.version_info[:2] == (3, 10):
     import urllib.request
-    with tempfile.NamedTemporaryFile(suffix='.py') as fout:
+
+    with tempfile.NamedTemporaryFile(suffix=".py") as fout:
         urllib.request.urlretrieve("https://bootstrap.pypa.io/get-pip.py", fout.name)
         subprocess.call([sys.executable, fout.name])
13 changes: 7 additions & 6 deletions docs/notebooks/test_notebooks.py
@@ -6,24 +6,25 @@
 import nbformat
 from nbconvert.preprocessors import ExecutePreprocessor
 from nbconvert.preprocessors.execute import CellExecutionError
 
 "from smart_open import smart_open\n",
 
 
 def _notebook_run(path):
     """Execute a notebook via nbconvert and collect output.
-    :returns (parsed nb object, execution errors)
+    :returns (parsed nb object, execution errors)
     """
-    kernel_name = 'python%d' % sys.version_info[0]
+    kernel_name = "python%d" % sys.version_info[0]
     this_file_directory = os.path.dirname(__file__)
     errors = []
-    with tempfile.NamedTemporaryFile(suffix=".ipynb", mode='wt') as fout:
-        with smart_open(path, 'rb') as f:
+    with tempfile.NamedTemporaryFile(suffix=".ipynb", mode="wt") as fout:
+        with smart_open(path, "rb") as f:
             nb = nbformat.read(f, as_version=4)
-            nb.metadata.get('kernelspec', {})['name'] = kernel_name
+            nb.metadata.get("kernelspec", {})["name"] = kernel_name
             ep = ExecutePreprocessor(kernel_name=kernel_name, timeout=10)
 
             try:
-                ep.preprocess(nb, {'metadata': {'path': this_file_directory}})
+                ep.preprocess(nb, {"metadata": {"path": this_file_directory}})
             except CellExecutionError as e:
                 if "SKIP" in e.traceback:
                     print(str(e.traceback).split("\n")[-2])
18 changes: 11 additions & 7 deletions docs/src/auto_examples/core/run_core_concepts.py
@@ -94,13 +94,16 @@
 #
 
 # Create a set of frequent words
-stoplist = set('for a of the and to in'.split(' '))
+stoplist = set("for a of the and to in".split(" "))
 # Lowercase each document, split it by white space and filter out stopwords
-texts = [[word for word in document.lower().split() if word not in stoplist]
-         for document in text_corpus]
+texts = [
+    [word for word in document.lower().split() if word not in stoplist]
+    for document in text_corpus
+]
 
 # Count word frequencies
 from collections import defaultdict
+
 frequency = defaultdict(int)
 for text in texts:
     for token in text:
@@ -288,7 +291,7 @@
 
 ###############################################################################
 # and to query the similarity of our query document ``query_document`` against every document in the corpus:
-query_document = 'system engineering'.split()
+query_document = "system engineering".split()
 query_bow = dictionary.doc2bow(query_document)
 sims = index[tfidf[query_bow]]
 print(list(enumerate(sims)))
@@ -323,8 +326,9 @@
 #
 # There's still much more to learn about :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py`.
 
-import matplotlib.pyplot as plt
 import matplotlib.image as mpimg
-img = mpimg.imread('run_core_concepts.png')
+import matplotlib.pyplot as plt
+
+img = mpimg.imread("run_core_concepts.png")
 imgplot = plt.imshow(img)
-_ = plt.axis('off')
+_ = plt.axis("off")
58 changes: 35 additions & 23 deletions docs/src/auto_examples/core/run_corpora_and_vector_spaces.py
@@ -8,7 +8,10 @@
 """
 
 import logging
-logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+logging.basicConfig(
+    format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
+)
 
 ###############################################################################
 # First, let’s create a small corpus of nine short documents [1]_:
@@ -38,11 +41,11 @@
 # First, let's tokenize the documents, remove common words (using a toy stoplist)
 # as well as words that only appear once in the corpus:
 
-from pprint import pprint  # pretty-printer
 from collections import defaultdict
+from pprint import pprint  # pretty-printer
 
 # remove common words and tokenize
-stoplist = set('for a of the and to in'.split())
+stoplist = set("for a of the and to in".split())
 texts = [
     [word for word in document.lower().split() if word not in stoplist]
     for document in documents
@@ -54,10 +57,7 @@
     for token in text:
         frequency[token] += 1
 
-texts = [
-    [token for token in text if frequency[token] > 1]
-    for text in texts
-]
+texts = [[token for token in text if frequency[token] > 1] for text in texts]
 
 pprint(texts)
 
@@ -86,8 +86,9 @@
 # between the questions and ids is called a dictionary:
 
 from gensim import corpora
+
 dictionary = corpora.Dictionary(texts)
-dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
+dictionary.save("/tmp/deerwester.dict")  # store the dictionary, for future reference
 print(dictionary)
 
 ###############################################################################
@@ -104,7 +105,9 @@
 
 new_doc = "Human computer interaction"
 new_vec = dictionary.doc2bow(new_doc.lower().split())
-print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored
+print(
+    new_vec
+)  # the word "interaction" does not appear in the dictionary and is ignored
 
 ###############################################################################
 # The function :func:`doc2bow` simply counts the number of occurrences of
@@ -114,7 +117,7 @@
 # (id 0) and `human` (id 1) appear once; the other ten dictionary words appear (implicitly) zero times.
 
 corpus = [dictionary.doc2bow(text) for text in texts]
-corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
+corpora.MmCorpus.serialize("/tmp/deerwester.mm", corpus)  # store to disk, for later use
 print(corpus)
 
 ###############################################################################
@@ -138,10 +141,11 @@
 
 class MyCorpus:
     def __iter__(self):
-        for line in open('https://radimrehurek.com/mycorpus.txt'):
+        for line in open("https://radimrehurek.com/mycorpus.txt"):
             # assume there's one document per line, tokens separated by whitespace
             yield dictionary.doc2bow(line.lower().split())
 
+
 ###############################################################################
 # The full power of Gensim comes from the fact that a corpus doesn't have to be
 # a ``list``, or a ``NumPy`` array, or a ``Pandas`` dataframe, or whatever.
@@ -180,15 +184,19 @@ def __iter__(self):
 # Similarly, to construct the dictionary without loading all texts into memory:
 
 # collect statistics about all tokens
-dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/mycorpus.txt'))
+dictionary = corpora.Dictionary(
+    line.lower().split() for line in open("https://radimrehurek.com/mycorpus.txt")
+)
 # remove stop words and words that appear only once
 stop_ids = [
     dictionary.token2id[stopword]
     for stopword in stoplist
     if stopword in dictionary.token2id
 ]
 once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
-dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
+dictionary.filter_tokens(
+    stop_ids + once_ids
+)  # remove stop words and words that appear only once
 dictionary.compactify()  # remove gaps in id sequence after words that were removed
 print(dictionary)
 
@@ -218,22 +226,22 @@ def __iter__(self):
 # create a toy corpus of 2 documents, as a plain Python list
 corpus = [[(1, 0.5)], []]  # make one document empty, for the heck of it
 
-corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
+corpora.MmCorpus.serialize("/tmp/corpus.mm", corpus)
 
 ###############################################################################
 # Other formats include `Joachim's SVMlight format <http://svmlight.joachims.org/>`_,
 # `Blei's LDA-C format <http://www.cs.princeton.edu/~blei/lda-c/>`_ and
 # `GibbsLDA++ format <http://gibbslda.sourceforge.net/>`_.
 
-corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
-corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
-corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
+corpora.SvmLightCorpus.serialize("/tmp/corpus.svmlight", corpus)
+corpora.BleiCorpus.serialize("/tmp/corpus.lda-c", corpus)
+corpora.LowCorpus.serialize("/tmp/corpus.low", corpus)
 
 
 ###############################################################################
 # Conversely, to load a corpus iterator from a Matrix Market file:
 
-corpus = corpora.MmCorpus('/tmp/corpus.mm')
+corpus = corpora.MmCorpus("/tmp/corpus.mm")
 
 ###############################################################################
 # Corpus objects are streams, so typically you won't be able to print them directly:
@@ -259,7 +267,7 @@ def __iter__(self):
 #
 # To save the same Matrix Market document stream in Blei's LDA-C format,
 
-corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
+corpora.BleiCorpus.serialize("/tmp/corpus.lda-c", corpus)
 
 ###############################################################################
 # In this way, `gensim` can also be used as a memory-efficient **I/O format conversion tool**:
@@ -273,8 +281,10 @@ def __iter__(self):
 # Gensim also contains `efficient utility functions <http://radimrehurek.com/gensim/matutils.html>`_
 # to help converting from/to numpy matrices
 
-import gensim
 import numpy as np
+
+import gensim
+
 numpy_matrix = np.random.randint(10, size=[5, 2])  # random matrix as an example
 corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
 # numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)
@@ -283,6 +293,7 @@ def __iter__(self):
 # and from/to `scipy.sparse` matrices
 
 import scipy.sparse
+
 scipy_sparse_matrix = scipy.sparse.random(5, 2)  # random sparse matrix as example
 corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
 scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)
@@ -302,8 +313,9 @@ def __iter__(self):
 # .. [1] This is the same corpus as used in
 #    `Deerwester et al. (1990): Indexing by Latent Semantic Analysis <http://www.cs.bham.ac.uk/~pxt/IDA/lsa_ind.pdf>`_, Table 2.
 
-import matplotlib.pyplot as plt
 import matplotlib.image as mpimg
-img = mpimg.imread('run_corpora_and_vector_spaces.png')
+import matplotlib.pyplot as plt
+
+img = mpimg.imread("run_corpora_and_vector_spaces.png")
 imgplot = plt.imshow(img)
-_ = plt.axis('off')
+_ = plt.axis("off")