-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Python2 String Handling Cleanup in parsers.pyx #26270
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -21,16 +21,14 @@ from cython import Py_ssize_t | |||
from cpython cimport (PyObject, PyBytes_FromString, | ||||
PyBytes_AsString, | ||||
PyUnicode_AsUTF8String, | ||||
PyErr_Occurred, PyErr_Fetch) | ||||
PyErr_Occurred, PyErr_Fetch, | ||||
PyUnicode_Decode) | ||||
from cpython.ref cimport Py_XDECREF | ||||
|
||||
|
||||
cdef extern from "Python.h": | ||||
object PyUnicode_FromString(char *v) | ||||
|
||||
object PyUnicode_Decode(char *v, Py_ssize_t size, char *encoding, | ||||
char *errors) | ||||
|
||||
|
||||
import numpy as np | ||||
cimport numpy as cnp | ||||
|
@@ -84,11 +82,6 @@ cdef extern from "headers/portable.h": | |||
# loudly. | ||||
pass | ||||
|
||||
try: | ||||
basestring | ||||
except NameError: | ||||
basestring = str | ||||
|
||||
|
||||
cdef extern from "parser/tokenizer.h": | ||||
|
||||
|
@@ -632,7 +625,7 @@ cdef class TextReader: | |||
|
||||
if self.compression: | ||||
if self.compression == 'gzip': | ||||
if isinstance(source, basestring): | ||||
if isinstance(source, str): | ||||
source = gzip.GzipFile(source, 'rb') | ||||
else: | ||||
source = gzip.GzipFile(fileobj=source) | ||||
|
@@ -653,7 +646,7 @@ cdef class TextReader: | |||
raise ValueError('Multiple files found in compressed ' | ||||
'zip file %s', str(zip_names)) | ||||
elif self.compression == 'xz': | ||||
if isinstance(source, basestring): | ||||
if isinstance(source, str): | ||||
source = lzma.LZMAFile(source, 'rb') | ||||
else: | ||||
source = lzma.LZMAFile(filename=source) | ||||
|
@@ -671,11 +664,10 @@ cdef class TextReader: | |||
|
||||
self.handle = source | ||||
|
||||
if isinstance(source, basestring): | ||||
if not isinstance(source, bytes): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To get to this point in Py3 the object would have to be a str in the first place, which means it logically isn't a bytes type hence why I removed this condition altogether |
||||
encoding = sys.getfilesystemencoding() or "utf-8" | ||||
if isinstance(source, str): | ||||
encoding = sys.getfilesystemencoding() or "utf-8" | ||||
|
||||
source = source.encode(encoding) | ||||
source = source.encode(encoding) | ||||
|
||||
if self.memory_map: | ||||
ptr = new_mmap(source) | ||||
|
@@ -768,9 +760,7 @@ cdef class TextReader: | |||
for i in range(field_count): | ||||
word = self.parser.words[start + i] | ||||
|
||||
if path == CSTRING: | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. CSTRING was not a possible value in this enum hence removal. See here: pandas/pandas/_libs/parsers.pyx Line 1430 in 0989339
|
||||
name = PyBytes_FromString(word) | ||||
elif path == UTF8: | ||||
if path == UTF8: | ||||
name = PyUnicode_FromString(word) | ||||
elif path == ENCODED: | ||||
name = PyUnicode_Decode(word, strlen(word), | ||||
|
@@ -1309,9 +1299,6 @@ cdef class TextReader: | |||
elif path == ENCODED: | ||||
return _string_box_decode(self.parser, i, start, end, | ||||
na_filter, na_hashset, self.c_encoding) | ||||
elif path == CSTRING: | ||||
return _string_box_factorize(self.parser, i, start, end, | ||||
na_filter, na_hashset) | ||||
|
||||
def _get_converter(self, i, name): | ||||
if self.converters is None: | ||||
|
@@ -1389,7 +1376,7 @@ cdef: | |||
def _ensure_encoded(list lst): | ||||
cdef list result = [] | ||||
for x in lst: | ||||
if isinstance(x, unicode): | ||||
if isinstance(x, str): | ||||
x = PyUnicode_AsUTF8String(x) | ||||
elif not isinstance(x, bytes): | ||||
x = str(x).encode('utf-8') | ||||
|
@@ -1421,7 +1408,6 @@ def _maybe_upcast(arr): | |||
|
||||
|
||||
cdef enum StringPath: | ||||
CSTRING | ||||
UTF8 | ||||
ENCODED | ||||
|
||||
|
@@ -1663,10 +1649,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col, | |||
for k in range(table.n_buckets): | ||||
if kh_exist_str(table, k): | ||||
result[table.vals[k]] = PyUnicode_FromString(table.keys[k]) | ||||
elif path == CSTRING: | ||||
for k in range(table.n_buckets): | ||||
if kh_exist_str(table, k): | ||||
result[table.vals[k]] = PyBytes_FromString(table.keys[k]) | ||||
|
||||
kh_destroy_str(table) | ||||
return np.asarray(codes), result, na_count | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was orthogonal but figured preferable to import this way rather than through Python header (cython doesn't expose the last remaining item coming from Python.h)