Python2 String Handling Cleanup in parsers.pyx (#26270)

WillAyd · jreback · commit a8d61d39d252 · 2019-05-03T09:23:35.000-04:00
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -21,16 +21,14 @@ from cython import Py_ssize_t
 from cpython cimport (PyObject, PyBytes_FromString,
                       PyBytes_AsString,
                       PyUnicode_AsUTF8String,
-                      PyErr_Occurred, PyErr_Fetch)
+                      PyErr_Occurred, PyErr_Fetch,
+                      PyUnicode_Decode)
 from cpython.ref cimport Py_XDECREF
 
 
 cdef extern from "Python.h":
     object PyUnicode_FromString(char *v)
 
-    object PyUnicode_Decode(char *v, Py_ssize_t size, char *encoding,
-                            char *errors)
-
 
 import numpy as np
 cimport numpy as cnp
@@ -84,11 +82,6 @@ cdef extern from "headers/portable.h":
     # loudly.
     pass
 
-try:
-    basestring
-except NameError:
-    basestring = str
-
 
 cdef extern from "parser/tokenizer.h":
 
@@ -632,7 +625,7 @@ cdef class TextReader:
 
         if self.compression:
             if self.compression == 'gzip':
-                if isinstance(source, basestring):
+                if isinstance(source, str):
                     source = gzip.GzipFile(source, 'rb')
                 else:
                     source = gzip.GzipFile(fileobj=source)
@@ -653,7 +646,7 @@ cdef class TextReader:
                     raise ValueError('Multiple files found in compressed '
                                      'zip file %s', str(zip_names))
             elif self.compression == 'xz':
-                if isinstance(source, basestring):
+                if isinstance(source, str):
                     source = lzma.LZMAFile(source, 'rb')
                 else:
                     source = lzma.LZMAFile(filename=source)
@@ -671,11 +664,10 @@ cdef class TextReader:
 
             self.handle = source
 
-        if isinstance(source, basestring):
-            if not isinstance(source, bytes):
-                encoding = sys.getfilesystemencoding() or "utf-8"
+        if isinstance(source, str):
+            encoding = sys.getfilesystemencoding() or "utf-8"
 
-                source = source.encode(encoding)
+            source = source.encode(encoding)
 
             if self.memory_map:
                 ptr = new_mmap(source)
@@ -768,9 +760,7 @@ cdef class TextReader:
                 for i in range(field_count):
                     word = self.parser.words[start + i]
 
-                    if path == CSTRING:
-                        name = PyBytes_FromString(word)
-                    elif path == UTF8:
+                    if path == UTF8:
                         name = PyUnicode_FromString(word)
                     elif path == ENCODED:
                         name = PyUnicode_Decode(word, strlen(word),
@@ -1309,9 +1299,6 @@ cdef class TextReader:
         elif path == ENCODED:
             return _string_box_decode(self.parser, i, start, end,
                                       na_filter, na_hashset, self.c_encoding)
-        elif path == CSTRING:
-            return _string_box_factorize(self.parser, i, start, end,
-                                         na_filter, na_hashset)
 
     def _get_converter(self, i, name):
         if self.converters is None:
@@ -1389,7 +1376,7 @@ cdef:
 def _ensure_encoded(list lst):
     cdef list result = []
     for x in lst:
-        if isinstance(x, unicode):
+        if isinstance(x, str):
             x = PyUnicode_AsUTF8String(x)
         elif not isinstance(x, bytes):
             x = str(x).encode('utf-8')
@@ -1421,7 +1408,6 @@ def _maybe_upcast(arr):
 
 
 cdef enum StringPath:
-    CSTRING
     UTF8
     ENCODED
 
@@ -1663,10 +1649,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
         for k in range(table.n_buckets):
             if kh_exist_str(table, k):
                 result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
-    elif path == CSTRING:
-        for k in range(table.n_buckets):
-            if kh_exist_str(table, k):
-                result[table.vals[k]] = PyBytes_FromString(table.keys[k])
 
     kh_destroy_str(table)
     return np.asarray(codes), result, na_count