Fix smart_open deprecation warning globally
itayB committed Jun 15, 2019
1 parent ae7a8d7 commit d2afdc2
Showing 41 changed files with 151 additions and 151 deletions.
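The change is mechanical throughout: every call to the deprecated smart_open.smart_open() (exposed in gensim as utils.smart_open) becomes smart_open.open() (exposed as utils.open), with the previously implicit 'rb' mode now spelled out. A minimal sketch of the pattern, assuming smart_open >= 1.8.1 (the release that deprecated smart_open() in favour of open()) and a hypothetical file name:

    from smart_open import open  # replaces: from smart_open import smart_open

    # Old call -- emitted a DeprecationWarning; the mode defaulted to 'rb':
    #     fin = smart_open('corpus.txt.gz')
    # New call -- same transparent .gz/.bz2/S3 handling, explicit mode:
    with open('corpus.txt.gz', 'rb') as fin:
        print(sum(1 for _ in fin))
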
2 changes: 1 addition & 1 deletion gensim/corpora/_mmreader.pyx
@@ -188,7 +188,7 @@ cdef class MmReader(object):
if offset == -1:
return []
if isinstance(self.input, string_types):
-fin, close_fin = utils.smart_open(self.input), True
+fin, close_fin = utils.open(self.input, 'rb'), True
else:
fin, close_fin = self.input, False

10 changes: 5 additions & 5 deletions gensim/corpora/bleicorpus.py
@@ -74,7 +74,7 @@ def __init__(self, fname, fname_vocab=None):
raise IOError('BleiCorpus: could not find vocabulary file')

self.fname = fname
-with utils.smart_open(fname_vocab) as fin:
+with utils.open(fname_vocab, 'rb') as fin:
words = [utils.to_unicode(word).rstrip() for word in fin]
self.id2word = dict(enumerate(words))

@@ -88,7 +88,7 @@ def __iter__(self):
"""
lineno = -1
-with utils.smart_open(self.fname) as fin:
+with utils.open(self.fname, 'rb') as fin:
for lineno, line in enumerate(fin):
yield self.line2doc(line)
self.length = lineno + 1
@@ -149,7 +149,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
num_terms = 0

logger.info("storing corpus in Blei's LDA-C format into %s", fname)
-with utils.smart_open(fname, 'wb') as fout:
+with utils.open(fname, 'wb') as fout:
offsets = []
for doc in corpus:
doc = list(doc)
@@ -160,7 +160,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
# write out vocabulary, in a format compatible with Blei's topics.py script
fname_vocab = utils.smart_extension(fname, '.vocab')
logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
-with utils.smart_open(fname_vocab, 'wb') as fout:
+with utils.open(fname_vocab, 'wb') as fout:
for featureid in range(num_terms):
fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

@@ -181,6 +181,6 @@ def docbyoffset(self, offset):
Document in BoW format.
"""
-with utils.smart_open(self.fname) as f:
+with utils.open(self.fname, 'rb') as f:
f.seek(offset)
return self.line2doc(f.readline())
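
The corpus classes touched here all share the same indexing idiom: save_corpus records fout.tell() before writing each document, and docbyoffset later seeks straight to a stored offset and parses a single line. This is one reason the handles are opened in binary mode, where tell()/seek() values are plain byte offsets. A minimal sketch of the idiom with the builtin open (file name and contents hypothetical):

    docs = [b'doc one\n', b'doc two\n', b'doc three\n']

    offsets = []
    with open('corpus.txt', 'wb') as fout:   # save: record byte offsets
        for doc in docs:
            offsets.append(fout.tell())
            fout.write(doc)

    with open('corpus.txt', 'rb') as fin:    # load: random access by offset
        fin.seek(offsets[2])
        print(fin.readline())                # b'doc three\n'
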
4 changes: 2 additions & 2 deletions gensim/corpora/csvcorpus.py
@@ -45,7 +45,7 @@ def __init__(self, fname, labels):
self.labels = labels

# load the first few lines, to guess the CSV dialect
-head = ''.join(itertools.islice(utils.smart_open(self.fname), 5))
+head = ''.join(itertools.islice(utils.open(self.fname, 'rb'), 5))
self.headers = csv.Sniffer().has_header(head)
self.dialect = csv.Sniffer().sniff(head)
logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)
@@ -59,7 +59,7 @@ def __iter__(self):
Document in BoW format.
"""
-reader = csv.reader(utils.smart_open(self.fname), self.dialect)
+reader = csv.reader(utils.open(self.fname, 'rb'), self.dialect)
if self.headers:
next(reader) # skip the headers

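A caveat worth noting for csvcorpus.py: handing a binary file object to csv.reader only works on Python 2, where str is bytes; Python 3's csv module expects text. The commit deliberately preserves the existing 'rb' behaviour rather than changing semantics. A sketch of the Python 3 text-mode equivalent, assuming smart_open's open mirrors the builtin signature (file name hypothetical):

    import csv

    from smart_open import open

    with open('data.csv', 'r', encoding='utf-8', newline='') as fin:
        for row in csv.reader(fin):
            print(row)
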
4 changes: 2 additions & 2 deletions gensim/corpora/dictionary.py
@@ -516,7 +516,7 @@ def save_as_text(self, fname, sort_by_word=True):
"""
logger.info("saving dictionary mapping to %s", fname)
-with utils.smart_open(fname, 'wb') as fout:
+with utils.open(fname, 'wb') as fout:
numdocs_line = "%d\n" % self.num_docs
fout.write(utils.to_utf8(numdocs_line))
if sort_by_word:
@@ -669,7 +669,7 @@ def load_from_text(fname):
"""
result = Dictionary()
-with utils.smart_open(fname) as f:
+with utils.open(fname, 'rb') as f:
for lineno, line in enumerate(f):
line = utils.to_unicode(line)
if lineno == 0:
2 changes: 1 addition & 1 deletion gensim/corpora/hashdictionary.py
@@ -341,7 +341,7 @@ def save_as_text(self, fname):
"""
logger.info("saving %s mapping to %s" % (self, fname))
-with utils.smart_open(fname, 'wb') as fout:
+with utils.open(fname, 'wb') as fout:
for tokenid in self.keys():
words = sorted(self[tokenid])
if words:
8 changes: 4 additions & 4 deletions gensim/corpora/lowcorpus.py
@@ -131,7 +131,7 @@ def _calculate_num_docs(self):
"""
# the first line in input data is the number of documents (integer). throws exception on bad input.
-with utils.smart_open(self.fname) as fin:
+with utils.open(self.fname, 'rb') as fin:
try:
result = int(next(fin))
except StopIteration:
@@ -191,7 +191,7 @@ def __iter__(self):
Document in BoW format.
"""
-with utils.smart_open(self.fname) as fin:
+with utils.open(self.fname, 'rb') as fin:
for lineno, line in enumerate(fin):
if lineno > 0: # ignore the first line = number of documents
yield self.line2doc(line)
@@ -231,7 +231,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
logger.info("storing corpus in List-Of-Words format into %s" % fname)
truncated = 0
offsets = []
-with utils.smart_open(fname, 'wb') as fout:
+with utils.open(fname, 'wb') as fout:
fout.write(utils.to_utf8('%i\n' % len(corpus)))
for doc in corpus:
words = []
@@ -277,7 +277,7 @@ def docbyoffset(self, offset):
[(0, 1), (3, 1), (4, 1)]
"""
-with utils.smart_open(self.fname) as f:
+with utils.open(self.fname, 'rb') as f:
f.seek(offset)
return self.line2doc(f.readline())

8 changes: 4 additions & 4 deletions gensim/corpora/malletcorpus.py
@@ -83,7 +83,7 @@ def _calculate_num_docs(self):
Number of documents in file.
"""
-with utils.smart_open(self.fname) as fin:
+with utils.open(self.fname, 'rb') as fin:
result = sum(1 for _ in fin)
return result

@@ -96,7 +96,7 @@ def __iter__(self):
Document in BoW format (+"document_id" and "lang" if metadata=True).
"""
-with utils.smart_open(self.fname) as f:
+with utils.open(self.fname, 'rb') as f:
for line in f:
yield self.line2doc(line)

@@ -180,7 +180,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):

truncated = 0
offsets = []
-with utils.smart_open(fname, 'wb') as fout:
+with utils.open(fname, 'wb') as fout:
for doc_id, doc in enumerate(corpus):
if metadata:
doc_id, doc_lang = doc[1]
@@ -231,6 +231,6 @@ def docbyoffset(self, offset):
[(4, 1)]
"""
-with utils.smart_open(self.fname) as f:
+with utils.open(self.fname, 'rb') as f:
f.seek(offset)
return self.line2doc(f.readline())
6 changes: 3 additions & 3 deletions gensim/corpora/svmlightcorpus.py
@@ -74,7 +74,7 @@ def __iter__(self):
"""
lineno = -1
self.labels = []
-with utils.smart_open(self.fname) as fin:
+with utils.open(self.fname, 'rb') as fin:
for lineno, line in enumerate(fin):
doc = self.line2doc(line)
if doc is not None:
@@ -115,7 +115,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
# Cast any sequence (incl. a numpy array) to a list, to simplify the processing below.
labels = list(labels)
offsets = []
-with utils.smart_open(fname, 'wb') as fout:
+with utils.open(fname, 'wb') as fout:
for docno, doc in enumerate(corpus):
label = labels[docno] if labels else 0 # target class is 0 by default
offsets.append(fout.tell())
@@ -135,7 +135,7 @@ def docbyoffset(self, offset):
tuple of (int, float)
"""
-with utils.smart_open(self.fname) as f:
+with utils.open(self.fname, 'rb') as f:
f.seek(offset)
return self.line2doc(f.readline())[0]
# TODO: it breaks if it gets None from line2doc
6 changes: 3 additions & 3 deletions gensim/corpora/ucicorpus.py
@@ -39,7 +39,7 @@ def __init__(self, input):

self.input = input

-with utils.smart_open(self.input) as fin:
+with utils.open(self.input, 'rb') as fin:
self.num_docs = self.num_terms = self.num_nnz = 0
try:
self.num_docs = int(next(fin).strip())
@@ -188,7 +188,7 @@ def __init__(self, fname, fname_vocab=None):
fname_vocab = utils.smart_extension(fname, '.vocab')

self.fname = fname
-with utils.smart_open(fname_vocab) as fin:
+with utils.open(fname_vocab, 'rb') as fin:
words = [word.strip() for word in fin]
self.id2word = dict(enumerate(words))

@@ -286,7 +286,7 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False)
# write out vocabulary
fname_vocab = utils.smart_extension(fname, '.vocab')
logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
-with utils.smart_open(fname_vocab, 'wb') as fout:
+with utils.open(fname_vocab, 'wb') as fout:
for featureid in range(num_terms):
fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

Expand Down
4 changes: 2 additions & 2 deletions gensim/matutils.py
@@ -1236,7 +1236,7 @@ def __init__(self, fname):
self.fname = fname
if fname.endswith(".gz") or fname.endswith('.bz2'):
raise NotImplementedError("compressed output not supported with MmWriter")
-self.fout = utils.smart_open(self.fname, 'wb+') # open for both reading and writing
+self.fout = utils.open(self.fname, 'wb+') # open for both reading and writing
self.headers_written = False

def write_headers(self, num_docs, num_terms, num_nnz):
@@ -1574,7 +1574,7 @@ def docbyoffset(self, offset):
if offset == -1:
return []
if isinstance(self.input, string_types):
-fin, close_fin = utils.smart_open(self.input), True
+fin, close_fin = utils.open(self.input, 'rb'), True
else:
fin, close_fin = self.input, False

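A detail in matutils.py: MmWriter opens its output as 'wb+' (read and write) because it first writes a padded placeholder header, streams the matrix entries, and then seeks back to fill in the real dimensions once they are known; this is also why the constructor rejects .gz/.bz2 output, since a compressed stream being written cannot be seeked. A minimal sketch of the header-rewrite pattern (file name and padding width hypothetical):

    header = b'%%MatrixMarket matrix coordinate real general\n'
    with open('matrix.mm', 'wb+') as fout:
        fout.write(header)
        fout.write(b' ' * 50 + b'\n')  # placeholder for "num_docs num_terms num_nnz"
        fout.write(b'1 1 2.0\n')       # ...matrix entries, streamed...
        fout.seek(len(header))
        fout.write(b'1 1 1')           # real stats, written into the padded line
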
6 changes: 3 additions & 3 deletions gensim/models/deprecated/doc2vec.py
@@ -965,7 +965,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec)
# save document vectors
if doctag_vec:
-with utils.smart_open(fname, 'ab') as fout:
+with utils.open(fname, 'ab') as fout:
if not word_vec:
total_vec = len(self.docvecs)
logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
@@ -992,7 +992,7 @@ def __iter__(self):
fname = os.path.join(self.dirname, fname)
if not os.path.isfile(fname):
continue
-for item_no, line in enumerate(utils.smart_open(fname)):
+for item_no, line in enumerate(utils.open(fname, 'rb')):
line = utils.to_unicode(line)
# each file line is a single document in the Brown corpus
# each token is WORD/POS_TAG
@@ -1036,6 +1036,6 @@ def __iter__(self):
yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
except AttributeError:
# If it didn't work like a file, use it as a string filename
-with utils.smart_open(self.source) as fin:
+with utils.open(self.source, 'rb') as fin:
for item_no, line in enumerate(fin):
yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
12 changes: 6 additions & 6 deletions gensim/models/deprecated/keyedvectors.py
@@ -154,12 +154,12 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None)
vector_size = self.syn0.shape[1]
if fvocab is not None:
logger.info("storing vocabulary in %s", fvocab)
-with utils.smart_open(fvocab, 'wb') as vout:
+with utils.open(fvocab, 'wb') as vout:
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
assert (len(self.vocab), vector_size) == self.syn0.shape
-with utils.smart_open(fname, 'wb') as fout:
+with utils.open(fname, 'wb') as fout:
fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
# store in sorted order: most frequent words at the top
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
@@ -204,13 +204,13 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
if fvocab is not None:
logger.info("loading word counts from %s", fvocab)
counts = {}
-with utils.smart_open(fvocab) as fin:
+with utils.open(fvocab, 'rb') as fin:
for line in fin:
word, count = utils.to_unicode(line).strip().split()
counts[word] = int(count)

logger.info("loading projection weights from %s", fname)
-with utils.smart_open(fname) as fin:
+with utils.open(fname, 'rb') as fin:
header = utils.to_unicode(fin.readline(), encoding=encoding)
vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
if limit:
@@ -934,7 +934,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)

sections, section = [], None
-for line_no, line in enumerate(utils.smart_open(questions)):
+for line_no, line in enumerate(utils.open(questions, 'rb')):
# TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
line = utils.to_unicode(line)
if line.startswith(': '):
@@ -1030,7 +1030,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
original_vocab = self.vocab
self.vocab = ok_vocab

-for line_no, line in enumerate(utils.smart_open(pairs)):
+for line_no, line in enumerate(utils.open(pairs, 'rb')):
line = utils.to_unicode(line)
if line.startswith('#'):
# May be a comment
6 changes: 3 additions & 3 deletions gensim/models/deprecated/old_saveload.py
@@ -31,7 +31,7 @@

from six import iteritems

-from smart_open import smart_open
+from smart_open import open

if sys.version_info[0] >= 3:
unicode = str
@@ -367,7 +367,7 @@ def unpickle(fname):
Python object loaded from `fname`.
"""
-with smart_open(fname, 'rb') as f:
+with open(fname, 'rb') as f:
# Because of loading from S3 load can't be used (missing readline in smart_open)
file_bytes = f.read()
file_bytes = file_bytes.replace(b'gensim.models.word2vec', b'gensim.models.deprecated.word2vec')
@@ -395,5 +395,5 @@ def pickle(obj, fname, protocol=2):
Pickle protocol number, default is 2 to support compatible across python 2.x and 3.x.
"""
-with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows
+with open(fname, 'wb') as fout: # 'b' for binary, needed on Windows
_pickle.dump(obj, fout, protocol=protocol)
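
Note that old_saveload.py now does from smart_open import open, which shadows the builtin open for the rest of the module. That is safe here because smart_open.open behaves like the builtin for plain local paths while additionally accepting compressed files and remote URIs. A small sketch of the shadowing (path hypothetical):

    from smart_open import open  # shadows the builtin open below

    # Local paths behave as with the builtin:
    with open('/tmp/example.txt', 'w') as fout:
        fout.write('hello\n')

    # The same name also accepts URIs the builtin cannot, e.g.:
    #     open('s3://my-bucket/example.txt.gz', 'rb')
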
10 changes: 5 additions & 5 deletions gensim/models/deprecated/word2vec.py
@@ -1413,7 +1413,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut
"""
overlap_count = 0
logger.info("loading projection weights from %s", fname)
-with utils.smart_open(fname) as fin:
+with utils.open(fname, 'rb') as fin:
header = utils.to_unicode(fin.readline(), encoding=encoding)
vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
if not vector_size == self.vector_size:
@@ -1699,7 +1699,7 @@ def __iter__(self):
fname = os.path.join(self.dirname, fname)
if not os.path.isfile(fname):
continue
-for line in utils.smart_open(fname):
+for line in utils.open(fname, 'rb'):
line = utils.to_unicode(line)
# each file line is a single sentence in the Brown corpus
# each token is WORD/POS_TAG
@@ -1722,7 +1722,7 @@ def __iter__(self):
# the entire corpus is one gigantic line -- there are no sentence marks at all
# so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens
sentence, rest = [], b''
-with utils.smart_open(self.fname) as fin:
+with utils.open(self.fname, 'rb') as fin:
while True:
text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM
if text == rest: # EOF
@@ -1778,7 +1778,7 @@ def __iter__(self):
i += self.max_sentence_length
except AttributeError:
# If it didn't work like a file, use it as a string filename
-with utils.smart_open(self.source) as fin:
+with utils.open(self.source, 'rb') as fin:
for line in itertools.islice(fin, self.limit):
line = utils.to_unicode(line).split()
i = 0
@@ -1833,7 +1833,7 @@ def __iter__(self):
"""iterate through the files"""
for file_name in self.input_files:
logger.info('reading file %s', file_name)
-with utils.smart_open(file_name) as fin:
+with utils.open(file_name, 'rb') as fin:
for line in itertools.islice(fin, self.limit):
line = utils.to_unicode(line).split()
i = 0
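The Text8Corpus hunk above relies on a chunked-read idiom for a corpus whose entire content is one line: read fixed-size blocks, carry the trailing partial token over as rest, and split sentences arbitrarily at a fixed token count. A minimal sketch of the tokenizing part, assuming a space-separated one-line file (name and block size hypothetical):

    def iter_tokens(fname, block=8192):
        # Stream whitespace-separated tokens without loading the whole file.
        rest = b''
        with open(fname, 'rb') as fin:
            while True:
                text = rest + fin.read(block)
                if text == rest:        # EOF: read() returned nothing new
                    break
                words = text.split(b' ')
                rest = words.pop()      # the last token may be cut off mid-word
                for word in words:
                    if word:
                        yield word
        if rest:
            yield rest                  # flush the final token
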