Skip to content

Commit a8d61d3

Browse files
WillAyd authored and jreback committed
Python2 String Handling Cleanup in parsers.pyx (#26270)
1 parent e854ccf commit a8d61d3

File tree

1 file changed

+9
-27
lines changed

1 file changed

+9
-27
lines changed

pandas/_libs/parsers.pyx

+9 -27
Original file line number | Diff line number | Diff line change
@@ -21,16 +21,14 @@ from cython import Py_ssize_t
2121
from cpython cimport (PyObject, PyBytes_FromString,
2222
PyBytes_AsString,
2323
PyUnicode_AsUTF8String,
24-
PyErr_Occurred, PyErr_Fetch)
24+
PyErr_Occurred, PyErr_Fetch,
25+
PyUnicode_Decode)
2526
from cpython.ref cimport Py_XDECREF
2627

2728

2829
cdef extern from "Python.h":
2930
object PyUnicode_FromString(char *v)
3031

31-
object PyUnicode_Decode(char *v, Py_ssize_t size, char *encoding,
32-
char *errors)
33-
3432

3533
import numpy as np
3634
cimport numpy as cnp
@@ -84,11 +82,6 @@ cdef extern from "headers/portable.h":
8482
# loudly.
8583
pass
8684

87-
try:
88-
basestring
89-
except NameError:
90-
basestring = str
91-
9285

9386
cdef extern from "parser/tokenizer.h":
9487

@@ -632,7 +625,7 @@ cdef class TextReader:
632625

633626
if self.compression:
634627
if self.compression == 'gzip':
635-
if isinstance(source, basestring):
628+
if isinstance(source, str):
636629
source = gzip.GzipFile(source, 'rb')
637630
else:
638631
source = gzip.GzipFile(fileobj=source)
@@ -653,7 +646,7 @@ cdef class TextReader:
653646
raise ValueError('Multiple files found in compressed '
654647
'zip file %s', str(zip_names))
655648
elif self.compression == 'xz':
656-
if isinstance(source, basestring):
649+
if isinstance(source, str):
657650
source = lzma.LZMAFile(source, 'rb')
658651
else:
659652
source = lzma.LZMAFile(filename=source)
@@ -671,11 +664,10 @@ cdef class TextReader:
671664

672665
self.handle = source
673666

674-
if isinstance(source, basestring):
675-
if not isinstance(source, bytes):
676-
encoding = sys.getfilesystemencoding() or "utf-8"
667+
if isinstance(source, str):
668+
encoding = sys.getfilesystemencoding() or "utf-8"
677669

678-
source = source.encode(encoding)
670+
source = source.encode(encoding)
679671

680672
if self.memory_map:
681673
ptr = new_mmap(source)
@@ -768,9 +760,7 @@ cdef class TextReader:
768760
for i in range(field_count):
769761
word = self.parser.words[start + i]
770762

771-
if path == CSTRING:
772-
name = PyBytes_FromString(word)
773-
elif path == UTF8:
763+
if path == UTF8:
774764
name = PyUnicode_FromString(word)
775765
elif path == ENCODED:
776766
name = PyUnicode_Decode(word, strlen(word),
@@ -1309,9 +1299,6 @@ cdef class TextReader:
13091299
elif path == ENCODED:
13101300
return _string_box_decode(self.parser, i, start, end,
13111301
na_filter, na_hashset, self.c_encoding)
1312-
elif path == CSTRING:
1313-
return _string_box_factorize(self.parser, i, start, end,
1314-
na_filter, na_hashset)
13151302

13161303
def _get_converter(self, i, name):
13171304
if self.converters is None:
@@ -1389,7 +1376,7 @@ cdef:
13891376
def _ensure_encoded(list lst):
13901377
cdef list result = []
13911378
for x in lst:
1392-
if isinstance(x, unicode):
1379+
if isinstance(x, str):
13931380
x = PyUnicode_AsUTF8String(x)
13941381
elif not isinstance(x, bytes):
13951382
x = str(x).encode('utf-8')
@@ -1421,7 +1408,6 @@ def _maybe_upcast(arr):
14211408

14221409

14231410
cdef enum StringPath:
1424-
CSTRING
14251411
UTF8
14261412
ENCODED
14271413

@@ -1663,10 +1649,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
16631649
for k in range(table.n_buckets):
16641650
if kh_exist_str(table, k):
16651651
result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
1666-
elif path == CSTRING:
1667-
for k in range(table.n_buckets):
1668-
if kh_exist_str(table, k):
1669-
result[table.vals[k]] = PyBytes_FromString(table.keys[k])
16701652

16711653
kh_destroy_str(table)
16721654
return np.asarray(codes), result, na_count

0 commit comments

Comments
 (0)