@@ -21,16 +21,14 @@ from cython import Py_ssize_t
21
21
from cpython cimport (PyObject, PyBytes_FromString,
22
22
PyBytes_AsString,
23
23
PyUnicode_AsUTF8String,
24
- PyErr_Occurred, PyErr_Fetch)
24
+ PyErr_Occurred, PyErr_Fetch,
25
+ PyUnicode_Decode)
25
26
from cpython.ref cimport Py_XDECREF
26
27
27
28
28
29
cdef extern from " Python.h" :
29
30
object PyUnicode_FromString(char * v)
30
31
31
- object PyUnicode_Decode(char * v, Py_ssize_t size, char * encoding,
32
- char * errors)
33
-
34
32
35
33
import numpy as np
36
34
cimport numpy as cnp
@@ -84,11 +82,6 @@ cdef extern from "headers/portable.h":
84
82
# loudly.
85
83
pass
86
84
87
- try :
88
- basestring
89
- except NameError :
90
- basestring = str
91
-
92
85
93
86
cdef extern from " parser/tokenizer.h" :
94
87
@@ -632,7 +625,7 @@ cdef class TextReader:
632
625
633
626
if self .compression:
634
627
if self .compression == ' gzip' :
635
- if isinstance (source, basestring ):
628
+ if isinstance (source, str ):
636
629
source = gzip.GzipFile(source, ' rb' )
637
630
else :
638
631
source = gzip.GzipFile(fileobj = source)
@@ -653,7 +646,7 @@ cdef class TextReader:
653
646
raise ValueError (' Multiple files found in compressed '
654
647
' zip file %s ' , str (zip_names))
655
648
elif self .compression == ' xz' :
656
- if isinstance (source, basestring ):
649
+ if isinstance (source, str ):
657
650
source = lzma.LZMAFile(source, ' rb' )
658
651
else :
659
652
source = lzma.LZMAFile(filename = source)
@@ -671,11 +664,10 @@ cdef class TextReader:
671
664
672
665
self .handle = source
673
666
674
- if isinstance (source, basestring ):
675
- if not isinstance (source, bytes):
676
- encoding = sys.getfilesystemencoding() or " utf-8"
667
+ if isinstance (source, str ):
668
+ encoding = sys.getfilesystemencoding() or " utf-8"
677
669
678
- source = source.encode(encoding)
670
+ source = source.encode(encoding)
679
671
680
672
if self .memory_map:
681
673
ptr = new_mmap(source)
@@ -768,9 +760,7 @@ cdef class TextReader:
768
760
for i in range (field_count):
769
761
word = self .parser.words[start + i]
770
762
771
- if path == CSTRING:
772
- name = PyBytes_FromString(word)
773
- elif path == UTF8:
763
+ if path == UTF8:
774
764
name = PyUnicode_FromString(word)
775
765
elif path == ENCODED:
776
766
name = PyUnicode_Decode(word, strlen(word),
@@ -1309,9 +1299,6 @@ cdef class TextReader:
1309
1299
elif path == ENCODED:
1310
1300
return _string_box_decode(self .parser, i, start, end,
1311
1301
na_filter, na_hashset, self .c_encoding)
1312
- elif path == CSTRING:
1313
- return _string_box_factorize(self .parser, i, start, end,
1314
- na_filter, na_hashset)
1315
1302
1316
1303
def _get_converter (self , i , name ):
1317
1304
if self .converters is None :
@@ -1389,7 +1376,7 @@ cdef:
1389
1376
def _ensure_encoded (list lst ):
1390
1377
cdef list result = []
1391
1378
for x in lst:
1392
- if isinstance (x, unicode ):
1379
+ if isinstance (x, str ):
1393
1380
x = PyUnicode_AsUTF8String(x)
1394
1381
elif not isinstance (x, bytes):
1395
1382
x = str (x).encode(' utf-8' )
@@ -1421,7 +1408,6 @@ def _maybe_upcast(arr):
1421
1408
1422
1409
1423
1410
cdef enum StringPath:
1424
- CSTRING
1425
1411
UTF8
1426
1412
ENCODED
1427
1413
@@ -1663,10 +1649,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
1663
1649
for k in range (table.n_buckets):
1664
1650
if kh_exist_str(table, k):
1665
1651
result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
1666
- elif path == CSTRING:
1667
- for k in range (table.n_buckets):
1668
- if kh_exist_str(table, k):
1669
- result[table.vals[k]] = PyBytes_FromString(table.keys[k])
1670
1652
1671
1653
kh_destroy_str(table)
1672
1654
return np.asarray(codes), result, na_count
0 commit comments