default isort && default black #3322

Closed · wants to merge 1 commit
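The diffs below all come from running the two formatters with their default settings: isort regroups and alphabetizes imports, while black rewraps long statements and normalizes string literals to double quotes. As a rough illustration only (not part of this PR; the sample source string below is made up), the same kind of change can be reproduced through the formatters' public Python APIs:

# Illustration only: apply default isort and default black to a source string.
# The PR itself was presumably produced by running the command-line tools
# (python -m isort . && python -m black .) over the repository.
import black
import isort

SOURCE = "import sys\nimport os\nprint('hello from %s' % sys.argv[0])\n"

sorted_source = isort.code(SOURCE)  # default isort: sort and group imports
formatted = black.format_str(sorted_source, mode=black.Mode())  # default black: quotes, wrapping
print(formatted)

Running the snippet prints the imports in alphabetical order and the print call rewritten with double quotes, which is exactly the pattern repeated throughout the file diffs that follow.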
16 changes: 8 additions & 8 deletions continuous_integration/check_wheels.py
@@ -9,30 +9,30 @@
 
 import requests
 
 
 def to_int(value):
-    value = ''.join((x for x in value if x.isdigit()))
+    value = "".join((x for x in value if x.isdigit()))
     try:
         return int(value)
     except Exception:
         return 0
 
 
 def to_tuple(version):
-    return tuple(to_int(x) for x in version.split('.'))
+    return tuple(to_int(x) for x in version.split("."))
 
 
 def main():
     project = sys.argv[1]
-    json = requests.get('https://pypi.org/pypi/%s/json' % project).json()
-    for version in sorted(json['releases'], key=to_tuple):
+    json = requests.get("https://pypi.org/pypi/%s/json" % project).json()
+    for version in sorted(json["releases"], key=to_tuple):
         print(version)
         wheel_packages = [
-            p for p in json['releases'][version]
-            if p['packagetype'] == 'bdist_wheel'
+            p for p in json["releases"][version] if p["packagetype"] == "bdist_wheel"
         ]
         for p in wheel_packages:
-            print(' %(python_version)s %(filename)s' % p)
+            print(" %(python_version)s %(filename)s" % p)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
7 changes: 3 additions & 4 deletions continuous_integration/install_wheel.py
@@ -6,12 +6,11 @@
 import subprocess
 
 curr_dir = os.path.dirname(__file__)
-dist_path = os.path.join(curr_dir, '..', 'dist')
+dist_path = os.path.join(curr_dir, "..", "dist")
 wheels = [
-    os.path.join(dist_path, f)
-    for f in os.listdir(dist_path) if f.endswith('.whl')
+    os.path.join(dist_path, f) for f in os.listdir(dist_path) if f.endswith(".whl")
 ]
 assert len(wheels) == 1, "wheels = %r" % wheels
 
-command = 'pip install --pre --force-reinstall'.split() + [wheels[0]]
+command = "pip install --pre --force-reinstall".split() + [wheels[0]]
 subprocess.check_call(command)
6 changes: 4 additions & 2 deletions continuous_integration/upgrade_pip_py310.py
@@ -3,8 +3,10 @@
 import subprocess
 import sys
 import tempfile
-if sys.platform in ('linux', 'darwin') and sys.version_info[:2] == (3, 10):
+
+if sys.platform in ("linux", "darwin") and sys.version_info[:2] == (3, 10):
     import urllib.request
-    with tempfile.NamedTemporaryFile(suffix='.py') as fout:
+
+    with tempfile.NamedTemporaryFile(suffix=".py") as fout:
         urllib.request.urlretrieve("https://bootstrap.pypa.io/get-pip.py", fout.name)
         subprocess.call([sys.executable, fout.name])
13 changes: 7 additions & 6 deletions docs/notebooks/test_notebooks.py
@@ -6,24 +6,25 @@
 import nbformat
 from nbconvert.preprocessors import ExecutePreprocessor
 from nbconvert.preprocessors.execute import CellExecutionError
 
 "from smart_open import smart_open\n",
 
 
 def _notebook_run(path):
     """Execute a notebook via nbconvert and collect output.
-    :returns (parsed nb object, execution errors)
+    :returns (parsed nb object, execution errors)
     """
-    kernel_name = 'python%d' % sys.version_info[0]
+    kernel_name = "python%d" % sys.version_info[0]
     this_file_directory = os.path.dirname(__file__)
     errors = []
-    with tempfile.NamedTemporaryFile(suffix=".ipynb", mode='wt') as fout:
-        with smart_open(path, 'rb') as f:
+    with tempfile.NamedTemporaryFile(suffix=".ipynb", mode="wt") as fout:
+        with smart_open(path, "rb") as f:
             nb = nbformat.read(f, as_version=4)
-            nb.metadata.get('kernelspec', {})['name'] = kernel_name
+            nb.metadata.get("kernelspec", {})["name"] = kernel_name
             ep = ExecutePreprocessor(kernel_name=kernel_name, timeout=10)
 
             try:
-                ep.preprocess(nb, {'metadata': {'path': this_file_directory}})
+                ep.preprocess(nb, {"metadata": {"path": this_file_directory}})
             except CellExecutionError as e:
                 if "SKIP" in e.traceback:
                     print(str(e.traceback).split("\n")[-2])
18 changes: 11 additions & 7 deletions docs/src/auto_examples/core/run_core_concepts.py
@@ -94,13 +94,16 @@
 #
 
 # Create a set of frequent words
-stoplist = set('for a of the and to in'.split(' '))
+stoplist = set("for a of the and to in".split(" "))
 # Lowercase each document, split it by white space and filter out stopwords
-texts = [[word for word in document.lower().split() if word not in stoplist]
-         for document in text_corpus]
+texts = [
+    [word for word in document.lower().split() if word not in stoplist]
+    for document in text_corpus
+]
 
 # Count word frequencies
 from collections import defaultdict
+
 frequency = defaultdict(int)
 for text in texts:
     for token in text:
@@ -288,7 +291,7 @@
 
 ###############################################################################
 # and to query the similarity of our query document ``query_document`` against every document in the corpus:
-query_document = 'system engineering'.split()
+query_document = "system engineering".split()
 query_bow = dictionary.doc2bow(query_document)
 sims = index[tfidf[query_bow]]
 print(list(enumerate(sims)))
@@ -323,8 +326,9 @@
 #
 # There's still much more to learn about :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py`.
 
-import matplotlib.pyplot as plt
 import matplotlib.image as mpimg
-img = mpimg.imread('run_core_concepts.png')
+import matplotlib.pyplot as plt
+
+img = mpimg.imread("run_core_concepts.png")
 imgplot = plt.imshow(img)
-_ = plt.axis('off')
+_ = plt.axis("off")
58 changes: 35 additions & 23 deletions docs/src/auto_examples/core/run_corpora_and_vector_spaces.py
@@ -8,7 +8,10 @@
 """
 
 import logging
-logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+logging.basicConfig(
+    format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
+)
 
 ###############################################################################
 # First, let’s create a small corpus of nine short documents [1]_:
@@ -38,11 +41,11 @@
 # First, let's tokenize the documents, remove common words (using a toy stoplist)
 # as well as words that only appear once in the corpus:
 
-from pprint import pprint  # pretty-printer
 from collections import defaultdict
+from pprint import pprint  # pretty-printer
 
 # remove common words and tokenize
-stoplist = set('for a of the and to in'.split())
+stoplist = set("for a of the and to in".split())
 texts = [
     [word for word in document.lower().split() if word not in stoplist]
     for document in documents
@@ -54,10 +57,7 @@
     for token in text:
         frequency[token] += 1
 
-texts = [
-    [token for token in text if frequency[token] > 1]
-    for text in texts
-]
+texts = [[token for token in text if frequency[token] > 1] for text in texts]
 
 pprint(texts)
 
@@ -86,8 +86,9 @@
 # between the questions and ids is called a dictionary:
 
 from gensim import corpora
+
 dictionary = corpora.Dictionary(texts)
-dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
+dictionary.save("/tmp/deerwester.dict")  # store the dictionary, for future reference
 print(dictionary)
 
 ###############################################################################
@@ -104,7 +105,9 @@
 
 new_doc = "Human computer interaction"
 new_vec = dictionary.doc2bow(new_doc.lower().split())
-print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored
+print(
+    new_vec
+)  # the word "interaction" does not appear in the dictionary and is ignored
 
 ###############################################################################
 # The function :func:`doc2bow` simply counts the number of occurrences of
@@ -114,7 +117,7 @@
 # (id 0) and `human` (id 1) appear once; the other ten dictionary words appear (implicitly) zero times.
 
 corpus = [dictionary.doc2bow(text) for text in texts]
-corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
+corpora.MmCorpus.serialize("/tmp/deerwester.mm", corpus)  # store to disk, for later use
 print(corpus)
 
 ###############################################################################
@@ -138,10 +141,11 @@
 
 class MyCorpus:
     def __iter__(self):
-        for line in open('https://radimrehurek.com/mycorpus.txt'):
+        for line in open("https://radimrehurek.com/mycorpus.txt"):
             # assume there's one document per line, tokens separated by whitespace
             yield dictionary.doc2bow(line.lower().split())
 
+
 ###############################################################################
 # The full power of Gensim comes from the fact that a corpus doesn't have to be
 # a ``list``, or a ``NumPy`` array, or a ``Pandas`` dataframe, or whatever.
@@ -180,15 +184,19 @@ def __iter__(self):
 # Similarly, to construct the dictionary without loading all texts into memory:
 
 # collect statistics about all tokens
-dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/mycorpus.txt'))
+dictionary = corpora.Dictionary(
+    line.lower().split() for line in open("https://radimrehurek.com/mycorpus.txt")
+)
 # remove stop words and words that appear only once
 stop_ids = [
     dictionary.token2id[stopword]
     for stopword in stoplist
     if stopword in dictionary.token2id
 ]
 once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
-dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
+dictionary.filter_tokens(
+    stop_ids + once_ids
+)  # remove stop words and words that appear only once
 dictionary.compactify()  # remove gaps in id sequence after words that were removed
 print(dictionary)
 
@@ -218,22 +226,22 @@ def __iter__(self):
 # create a toy corpus of 2 documents, as a plain Python list
 corpus = [[(1, 0.5)], []]  # make one document empty, for the heck of it
 
-corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
+corpora.MmCorpus.serialize("/tmp/corpus.mm", corpus)
 
 ###############################################################################
 # Other formats include `Joachim's SVMlight format <http://svmlight.joachims.org/>`_,
 # `Blei's LDA-C format <http://www.cs.princeton.edu/~blei/lda-c/>`_ and
 # `GibbsLDA++ format <http://gibbslda.sourceforge.net/>`_.
 
-corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
-corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
-corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
+corpora.SvmLightCorpus.serialize("/tmp/corpus.svmlight", corpus)
+corpora.BleiCorpus.serialize("/tmp/corpus.lda-c", corpus)
+corpora.LowCorpus.serialize("/tmp/corpus.low", corpus)
 
 
 ###############################################################################
 # Conversely, to load a corpus iterator from a Matrix Market file:
 
-corpus = corpora.MmCorpus('/tmp/corpus.mm')
+corpus = corpora.MmCorpus("/tmp/corpus.mm")
 
 ###############################################################################
 # Corpus objects are streams, so typically you won't be able to print them directly:
@@ -259,7 +267,7 @@ def __iter__(self):
 #
 # To save the same Matrix Market document stream in Blei's LDA-C format,
 
-corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
+corpora.BleiCorpus.serialize("/tmp/corpus.lda-c", corpus)
 
 ###############################################################################
 # In this way, `gensim` can also be used as a memory-efficient **I/O format conversion tool**:
@@ -273,8 +281,10 @@ def __iter__(self):
 # Gensim also contains `efficient utility functions <http://radimrehurek.com/gensim/matutils.html>`_
 # to help converting from/to numpy matrices
 
-import gensim
 import numpy as np
+
+import gensim
+
 numpy_matrix = np.random.randint(10, size=[5, 2])  # random matrix as an example
 corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
 # numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)
@@ -283,6 +293,7 @@ def __iter__(self):
 # and from/to `scipy.sparse` matrices
 
 import scipy.sparse
+
 scipy_sparse_matrix = scipy.sparse.random(5, 2)  # random sparse matrix as example
 corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
 scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)
@@ -302,8 +313,9 @@ def __iter__(self):
 # .. [1] This is the same corpus as used in
 #    `Deerwester et al. (1990): Indexing by Latent Semantic Analysis <http://www.cs.bham.ac.uk/~pxt/IDA/lsa_ind.pdf>`_, Table 2.
 
-import matplotlib.pyplot as plt
 import matplotlib.image as mpimg
-img = mpimg.imread('run_corpora_and_vector_spaces.png')
+import matplotlib.pyplot as plt
+
+img = mpimg.imread("run_corpora_and_vector_spaces.png")
 imgplot = plt.imshow(img)
-_ = plt.axis('off')
+_ = plt.axis("off")