Skip to content

Commit

Permalink
Fix smart_open deprecation warning globally
Browse files Browse the repository at this point in the history
  • Loading branch information
itayB committed Jun 21, 2019
1 parent ae7a8d7 commit 9b8a35c
Show file tree
Hide file tree
Showing 42 changed files with 263 additions and 229 deletions.
196 changes: 115 additions & 81 deletions gensim/corpora/_mmreader.c

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion gensim/corpora/_mmreader.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ cdef class MmReader(object):
if offset == -1:
return []
if isinstance(self.input, string_types):
fin, close_fin = utils.smart_open(self.input), True
fin, close_fin = utils.open(self.input, 'rb'), True
else:
fin, close_fin = self.input, False

Expand Down
10 changes: 5 additions & 5 deletions gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __init__(self, fname, fname_vocab=None):
raise IOError('BleiCorpus: could not find vocabulary file')

self.fname = fname
with utils.smart_open(fname_vocab) as fin:
with utils.open(fname_vocab, 'rb') as fin:
words = [utils.to_unicode(word).rstrip() for word in fin]
self.id2word = dict(enumerate(words))

Expand All @@ -88,7 +88,7 @@ def __iter__(self):
"""
lineno = -1
with utils.smart_open(self.fname) as fin:
with utils.open(self.fname, 'rb') as fin:
for lineno, line in enumerate(fin):
yield self.line2doc(line)
self.length = lineno + 1
Expand Down Expand Up @@ -149,7 +149,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
num_terms = 0

logger.info("storing corpus in Blei's LDA-C format into %s", fname)
with utils.smart_open(fname, 'wb') as fout:
with utils.open(fname, 'wb') as fout:
offsets = []
for doc in corpus:
doc = list(doc)
Expand All @@ -160,7 +160,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
# write out vocabulary, in a format compatible with Blei's topics.py script
fname_vocab = utils.smart_extension(fname, '.vocab')
logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
with utils.smart_open(fname_vocab, 'wb') as fout:
with utils.open(fname_vocab, 'wb') as fout:
for featureid in range(num_terms):
fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

Expand All @@ -181,6 +181,6 @@ def docbyoffset(self, offset):
Document in BoW format.
"""
with utils.smart_open(self.fname) as f:
with utils.open(self.fname, 'rb') as f:
f.seek(offset)
return self.line2doc(f.readline())
4 changes: 2 additions & 2 deletions gensim/corpora/csvcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self, fname, labels):
self.labels = labels

# load the first few lines, to guess the CSV dialect
head = ''.join(itertools.islice(utils.smart_open(self.fname), 5))
head = ''.join(itertools.islice(utils.open(self.fname, 'rb'), 5))
self.headers = csv.Sniffer().has_header(head)
self.dialect = csv.Sniffer().sniff(head)
logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)
Expand All @@ -59,7 +59,7 @@ def __iter__(self):
Document in BoW format.
"""
reader = csv.reader(utils.smart_open(self.fname), self.dialect)
reader = csv.reader(utils.open(self.fname, 'rb'), self.dialect)
if self.headers:
next(reader) # skip the headers

Expand Down
4 changes: 2 additions & 2 deletions gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ def save_as_text(self, fname, sort_by_word=True):
"""
logger.info("saving dictionary mapping to %s", fname)
with utils.smart_open(fname, 'wb') as fout:
with utils.open(fname, 'wb') as fout:
numdocs_line = "%d\n" % self.num_docs
fout.write(utils.to_utf8(numdocs_line))
if sort_by_word:
Expand Down Expand Up @@ -669,7 +669,7 @@ def load_from_text(fname):
"""
result = Dictionary()
with utils.smart_open(fname) as f:
with utils.open(fname, 'rb') as f:
for lineno, line in enumerate(f):
line = utils.to_unicode(line)
if lineno == 0:
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/hashdictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def save_as_text(self, fname):
"""
logger.info("saving %s mapping to %s" % (self, fname))
with utils.smart_open(fname, 'wb') as fout:
with utils.open(fname, 'wb') as fout:
for tokenid in self.keys():
words = sorted(self[tokenid])
if words:
Expand Down
8 changes: 4 additions & 4 deletions gensim/corpora/lowcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def _calculate_num_docs(self):
"""
# the first line in input data is the number of documents (integer). throws exception on bad input.
with utils.smart_open(self.fname) as fin:
with utils.open(self.fname, 'rb') as fin:
try:
result = int(next(fin))
except StopIteration:
Expand Down Expand Up @@ -191,7 +191,7 @@ def __iter__(self):
Document in BoW format.
"""
with utils.smart_open(self.fname) as fin:
with utils.open(self.fname, 'rb') as fin:
for lineno, line in enumerate(fin):
if lineno > 0: # ignore the first line = number of documents
yield self.line2doc(line)
Expand Down Expand Up @@ -231,7 +231,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
logger.info("storing corpus in List-Of-Words format into %s" % fname)
truncated = 0
offsets = []
with utils.smart_open(fname, 'wb') as fout:
with utils.open(fname, 'wb') as fout:
fout.write(utils.to_utf8('%i\n' % len(corpus)))
for doc in corpus:
words = []
Expand Down Expand Up @@ -277,7 +277,7 @@ def docbyoffset(self, offset):
[(0, 1), (3, 1), (4, 1)]
"""
with utils.smart_open(self.fname) as f:
with utils.open(self.fname, 'rb') as f:
f.seek(offset)
return self.line2doc(f.readline())

Expand Down
8 changes: 4 additions & 4 deletions gensim/corpora/malletcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def _calculate_num_docs(self):
Number of documents in file.
"""
with utils.smart_open(self.fname) as fin:
with utils.open(self.fname, 'rb') as fin:
result = sum(1 for _ in fin)
return result

Expand All @@ -96,7 +96,7 @@ def __iter__(self):
Document in BoW format (+"document_id" and "lang" if metadata=True).
"""
with utils.smart_open(self.fname) as f:
with utils.open(self.fname, 'rb') as f:
for line in f:
yield self.line2doc(line)

Expand Down Expand Up @@ -180,7 +180,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):

truncated = 0
offsets = []
with utils.smart_open(fname, 'wb') as fout:
with utils.open(fname, 'wb') as fout:
for doc_id, doc in enumerate(corpus):
if metadata:
doc_id, doc_lang = doc[1]
Expand Down Expand Up @@ -231,6 +231,6 @@ def docbyoffset(self, offset):
[(4, 1)]
"""
with utils.smart_open(self.fname) as f:
with utils.open(self.fname, 'rb') as f:
f.seek(offset)
return self.line2doc(f.readline())
6 changes: 3 additions & 3 deletions gensim/corpora/svmlightcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __iter__(self):
"""
lineno = -1
self.labels = []
with utils.smart_open(self.fname) as fin:
with utils.open(self.fname, 'rb') as fin:
for lineno, line in enumerate(fin):
doc = self.line2doc(line)
if doc is not None:
Expand Down Expand Up @@ -115,7 +115,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
# Cast any sequence (incl. a numpy array) to a list, to simplify the processing below.
labels = list(labels)
offsets = []
with utils.smart_open(fname, 'wb') as fout:
with utils.open(fname, 'wb') as fout:
for docno, doc in enumerate(corpus):
label = labels[docno] if labels else 0 # target class is 0 by default
offsets.append(fout.tell())
Expand All @@ -135,7 +135,7 @@ def docbyoffset(self, offset):
tuple of (int, float)
"""
with utils.smart_open(self.fname) as f:
with utils.open(self.fname, 'rb') as f:
f.seek(offset)
return self.line2doc(f.readline())[0]
# TODO: this breaks if line2doc returns None
Expand Down
6 changes: 3 additions & 3 deletions gensim/corpora/ucicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def __init__(self, input):

self.input = input

with utils.smart_open(self.input) as fin:
with utils.open(self.input, 'rb') as fin:
self.num_docs = self.num_terms = self.num_nnz = 0
try:
self.num_docs = int(next(fin).strip())
Expand Down Expand Up @@ -188,7 +188,7 @@ def __init__(self, fname, fname_vocab=None):
fname_vocab = utils.smart_extension(fname, '.vocab')

self.fname = fname
with utils.smart_open(fname_vocab) as fin:
with utils.open(fname_vocab, 'rb') as fin:
words = [word.strip() for word in fin]
self.id2word = dict(enumerate(words))

Expand Down Expand Up @@ -286,7 +286,7 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False)
# write out vocabulary
fname_vocab = utils.smart_extension(fname, '.vocab')
logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
with utils.smart_open(fname_vocab, 'wb') as fout:
with utils.open(fname_vocab, 'wb') as fout:
for featureid in range(num_terms):
fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

Expand Down
4 changes: 2 additions & 2 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1236,7 +1236,7 @@ def __init__(self, fname):
self.fname = fname
if fname.endswith(".gz") or fname.endswith('.bz2'):
raise NotImplementedError("compressed output not supported with MmWriter")
self.fout = utils.smart_open(self.fname, 'wb+') # open for both reading and writing
self.fout = utils.open(self.fname, 'wb+') # open for both reading and writing
self.headers_written = False

def write_headers(self, num_docs, num_terms, num_nnz):
Expand Down Expand Up @@ -1574,7 +1574,7 @@ def docbyoffset(self, offset):
if offset == -1:
return []
if isinstance(self.input, string_types):
fin, close_fin = utils.smart_open(self.input), True
fin, close_fin = utils.open(self.input, 'rb'), True
else:
fin, close_fin = self.input, False

Expand Down
6 changes: 3 additions & 3 deletions gensim/models/deprecated/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -965,7 +965,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec)
# save document vectors
if doctag_vec:
with utils.smart_open(fname, 'ab') as fout:
with utils.open(fname, 'ab') as fout:
if not word_vec:
total_vec = len(self.docvecs)
logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
Expand All @@ -992,7 +992,7 @@ def __iter__(self):
fname = os.path.join(self.dirname, fname)
if not os.path.isfile(fname):
continue
for item_no, line in enumerate(utils.smart_open(fname)):
for item_no, line in enumerate(utils.open(fname, 'rb')):
line = utils.to_unicode(line)
# each file line is a single document in the Brown corpus
# each token is WORD/POS_TAG
Expand Down Expand Up @@ -1036,6 +1036,6 @@ def __iter__(self):
yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
except AttributeError:
# If it didn't work like a file, use it as a string filename
with utils.smart_open(self.source) as fin:
with utils.open(self.source, 'rb') as fin:
for item_no, line in enumerate(fin):
yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
12 changes: 6 additions & 6 deletions gensim/models/deprecated/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,12 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None)
vector_size = self.syn0.shape[1]
if fvocab is not None:
logger.info("storing vocabulary in %s", fvocab)
with utils.smart_open(fvocab, 'wb') as vout:
with utils.open(fvocab, 'wb') as vout:
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
assert (len(self.vocab), vector_size) == self.syn0.shape
with utils.smart_open(fname, 'wb') as fout:
with utils.open(fname, 'wb') as fout:
fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
# store in sorted order: most frequent words at the top
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
Expand Down Expand Up @@ -204,13 +204,13 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
if fvocab is not None:
logger.info("loading word counts from %s", fvocab)
counts = {}
with utils.smart_open(fvocab) as fin:
with utils.open(fvocab, 'rb') as fin:
for line in fin:
word, count = utils.to_unicode(line).strip().split()
counts[word] = int(count)

logger.info("loading projection weights from %s", fname)
with utils.smart_open(fname) as fin:
with utils.open(fname, 'rb') as fin:
header = utils.to_unicode(fin.readline(), encoding=encoding)
vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
if limit:
Expand Down Expand Up @@ -934,7 +934,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)

sections, section = [], None
for line_no, line in enumerate(utils.smart_open(questions)):
for line_no, line in enumerate(utils.open(questions, 'rb')):
# TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
line = utils.to_unicode(line)
if line.startswith(': '):
Expand Down Expand Up @@ -1030,7 +1030,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
original_vocab = self.vocab
self.vocab = ok_vocab

for line_no, line in enumerate(utils.smart_open(pairs)):
for line_no, line in enumerate(utils.open(pairs, 'rb')):
line = utils.to_unicode(line)
if line.startswith('#'):
# May be a comment
Expand Down
6 changes: 3 additions & 3 deletions gensim/models/deprecated/old_saveload.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

from six import iteritems

from smart_open import smart_open
from smart_open import open

if sys.version_info[0] >= 3:
unicode = str
Expand Down Expand Up @@ -367,7 +367,7 @@ def unpickle(fname):
Python object loaded from `fname`.
"""
with smart_open(fname, 'rb') as f:
with open(fname, 'rb') as f:
# Because the file may be loaded from S3, `load` can't be used (readline is missing in smart_open)
file_bytes = f.read()
file_bytes = file_bytes.replace(b'gensim.models.word2vec', b'gensim.models.deprecated.word2vec')
Expand Down Expand Up @@ -395,5 +395,5 @@ def pickle(obj, fname, protocol=2):
Pickle protocol number; the default is 2, for compatibility across Python 2.x and 3.x.
"""
with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows
with open(fname, 'wb') as fout: # 'b' for binary, needed on Windows
_pickle.dump(obj, fout, protocol=protocol)
10 changes: 5 additions & 5 deletions gensim/models/deprecated/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -1413,7 +1413,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut
"""
overlap_count = 0
logger.info("loading projection weights from %s", fname)
with utils.smart_open(fname) as fin:
with utils.open(fname, 'rb') as fin:
header = utils.to_unicode(fin.readline(), encoding=encoding)
vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
if not vector_size == self.vector_size:
Expand Down Expand Up @@ -1699,7 +1699,7 @@ def __iter__(self):
fname = os.path.join(self.dirname, fname)
if not os.path.isfile(fname):
continue
for line in utils.smart_open(fname):
for line in utils.open(fname, 'rb'):
line = utils.to_unicode(line)
# each file line is a single sentence in the Brown corpus
# each token is WORD/POS_TAG
Expand All @@ -1722,7 +1722,7 @@ def __iter__(self):
# the entire corpus is one gigantic line -- there are no sentence marks at all
# so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens
sentence, rest = [], b''
with utils.smart_open(self.fname) as fin:
with utils.open(self.fname, 'rb') as fin:
while True:
text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM
if text == rest: # EOF
Expand Down Expand Up @@ -1778,7 +1778,7 @@ def __iter__(self):
i += self.max_sentence_length
except AttributeError:
# If it didn't work like a file, use it as a string filename
with utils.smart_open(self.source) as fin:
with utils.open(self.source, 'rb') as fin:
for line in itertools.islice(fin, self.limit):
line = utils.to_unicode(line).split()
i = 0
Expand Down Expand Up @@ -1833,7 +1833,7 @@ def __iter__(self):
"""iterate through the files"""
for file_name in self.input_files:
logger.info('reading file %s', file_name)
with utils.smart_open(file_name) as fin:
with utils.open(file_name, 'rb') as fin:
for line in itertools.islice(fin, self.limit):
line = utils.to_unicode(line).split()
i = 0
Expand Down
Loading

0 comments on commit 9b8a35c

Please sign in to comment.