diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a6512080eb428..e49e67b79e000 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -21,16 +21,14 @@ from cython import Py_ssize_t from cpython cimport (PyObject, PyBytes_FromString, PyBytes_AsString, PyUnicode_AsUTF8String, - PyErr_Occurred, PyErr_Fetch) + PyErr_Occurred, PyErr_Fetch, + PyUnicode_Decode) from cpython.ref cimport Py_XDECREF cdef extern from "Python.h": object PyUnicode_FromString(char *v) - object PyUnicode_Decode(char *v, Py_ssize_t size, char *encoding, - char *errors) - import numpy as np cimport numpy as cnp @@ -84,11 +82,6 @@ cdef extern from "headers/portable.h": # loudly. pass -try: - basestring -except NameError: - basestring = str - cdef extern from "parser/tokenizer.h": @@ -632,7 +625,7 @@ cdef class TextReader: if self.compression: if self.compression == 'gzip': - if isinstance(source, basestring): + if isinstance(source, str): source = gzip.GzipFile(source, 'rb') else: source = gzip.GzipFile(fileobj=source) @@ -653,7 +646,7 @@ cdef class TextReader: raise ValueError('Multiple files found in compressed ' 'zip file %s', str(zip_names)) elif self.compression == 'xz': - if isinstance(source, basestring): + if isinstance(source, str): source = lzma.LZMAFile(source, 'rb') else: source = lzma.LZMAFile(filename=source) @@ -671,11 +664,10 @@ cdef class TextReader: self.handle = source - if isinstance(source, basestring): - if not isinstance(source, bytes): - encoding = sys.getfilesystemencoding() or "utf-8" + if isinstance(source, str): + encoding = sys.getfilesystemencoding() or "utf-8" - source = source.encode(encoding) + source = source.encode(encoding) if self.memory_map: ptr = new_mmap(source) @@ -768,9 +760,7 @@ cdef class TextReader: for i in range(field_count): word = self.parser.words[start + i] - if path == CSTRING: - name = PyBytes_FromString(word) - elif path == UTF8: + if path == UTF8: name = PyUnicode_FromString(word) elif path == ENCODED: name = PyUnicode_Decode(word, strlen(word), @@ -1309,9 +1299,6 @@ cdef class TextReader: elif path == ENCODED: return _string_box_decode(self.parser, i, start, end, na_filter, na_hashset, self.c_encoding) - elif path == CSTRING: - return _string_box_factorize(self.parser, i, start, end, - na_filter, na_hashset) def _get_converter(self, i, name): if self.converters is None: @@ -1389,7 +1376,7 @@ cdef: def _ensure_encoded(list lst): cdef list result = [] for x in lst: - if isinstance(x, unicode): + if isinstance(x, str): x = PyUnicode_AsUTF8String(x) elif not isinstance(x, bytes): x = str(x).encode('utf-8') @@ -1421,7 +1408,6 @@ def _maybe_upcast(arr): cdef enum StringPath: - CSTRING UTF8 ENCODED @@ -1663,10 +1649,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col, for k in range(table.n_buckets): if kh_exist_str(table, k): result[table.vals[k]] = PyUnicode_FromString(table.keys[k]) - elif path == CSTRING: - for k in range(table.n_buckets): - if kh_exist_str(table, k): - result[table.vals[k]] = PyBytes_FromString(table.keys[k]) kh_destroy_str(table) return np.asarray(codes), result, na_count