Skip to content

Python2 String Handling Cleanup in parsers.pyx #26270

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 3, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 9 additions & 27 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,14 @@ from cython import Py_ssize_t
from cpython cimport (PyObject, PyBytes_FromString,
PyBytes_AsString,
PyUnicode_AsUTF8String,
PyErr_Occurred, PyErr_Fetch)
PyErr_Occurred, PyErr_Fetch,
PyUnicode_Decode)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was orthogonal but figured preferable to import this way rather than through Python header (cython doesn't expose the last remaining item coming from Python.h)

from cpython.ref cimport Py_XDECREF


cdef extern from "Python.h":
object PyUnicode_FromString(char *v)

object PyUnicode_Decode(char *v, Py_ssize_t size, char *encoding,
char *errors)


import numpy as np
cimport numpy as cnp
Expand Down Expand Up @@ -84,11 +82,6 @@ cdef extern from "headers/portable.h":
# loudly.
pass

try:
basestring
except NameError:
basestring = str


cdef extern from "parser/tokenizer.h":

Expand Down Expand Up @@ -632,7 +625,7 @@ cdef class TextReader:

if self.compression:
if self.compression == 'gzip':
if isinstance(source, basestring):
if isinstance(source, str):
source = gzip.GzipFile(source, 'rb')
else:
source = gzip.GzipFile(fileobj=source)
Expand All @@ -653,7 +646,7 @@ cdef class TextReader:
raise ValueError('Multiple files found in compressed '
'zip file %s', str(zip_names))
elif self.compression == 'xz':
if isinstance(source, basestring):
if isinstance(source, str):
source = lzma.LZMAFile(source, 'rb')
else:
source = lzma.LZMAFile(filename=source)
Expand All @@ -671,11 +664,10 @@ cdef class TextReader:

self.handle = source

if isinstance(source, basestring):
if not isinstance(source, bytes):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To get to this point in Py3 the object would have to be a str in the first place, which means it logically isn't a bytes type hence why I removed this condition altogether

encoding = sys.getfilesystemencoding() or "utf-8"
if isinstance(source, str):
encoding = sys.getfilesystemencoding() or "utf-8"

source = source.encode(encoding)
source = source.encode(encoding)

if self.memory_map:
ptr = new_mmap(source)
Expand Down Expand Up @@ -768,9 +760,7 @@ cdef class TextReader:
for i in range(field_count):
word = self.parser.words[start + i]

if path == CSTRING:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CSTRING was not a possible value in this enum hence removal. See here:

cdef inline StringPath _string_path(char *encoding):

name = PyBytes_FromString(word)
elif path == UTF8:
if path == UTF8:
name = PyUnicode_FromString(word)
elif path == ENCODED:
name = PyUnicode_Decode(word, strlen(word),
Expand Down Expand Up @@ -1309,9 +1299,6 @@ cdef class TextReader:
elif path == ENCODED:
return _string_box_decode(self.parser, i, start, end,
na_filter, na_hashset, self.c_encoding)
elif path == CSTRING:
return _string_box_factorize(self.parser, i, start, end,
na_filter, na_hashset)

def _get_converter(self, i, name):
if self.converters is None:
Expand Down Expand Up @@ -1389,7 +1376,7 @@ cdef:
def _ensure_encoded(list lst):
cdef list result = []
for x in lst:
if isinstance(x, unicode):
if isinstance(x, str):
x = PyUnicode_AsUTF8String(x)
elif not isinstance(x, bytes):
x = str(x).encode('utf-8')
Expand Down Expand Up @@ -1421,7 +1408,6 @@ def _maybe_upcast(arr):


cdef enum StringPath:
CSTRING
UTF8
ENCODED

Expand Down Expand Up @@ -1663,10 +1649,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
for k in range(table.n_buckets):
if kh_exist_str(table, k):
result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
elif path == CSTRING:
for k in range(table.n_buckets):
if kh_exist_str(table, k):
result[table.vals[k]] = PyBytes_FromString(table.keys[k])

kh_destroy_str(table)
return np.asarray(codes), result, na_count
Expand Down