Skip to content

Commit dafaf5c

Browse files
committed
BUG: Don't parse NaN as 'nan' in Data IO
Closes pandas-dev/pandas#20377 (gh-20377).
1 parent cf11f71 commit dafaf5c

File tree

4 files changed

+81
-9
lines changed

4 files changed

+81
-9
lines changed

pandas/_libs/lib.pyx

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -494,24 +494,38 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
494494
return result
495495

496496

497-
def astype_unicode(arr: ndarray) -> ndarray[object]:
497+
def astype_unicode(arr: ndarray,
498+
skipna: bool=False) -> ndarray[object]:
498499
cdef:
500+
object arr_i
499501
Py_ssize_t i, n = arr.size
500502
ndarray[object] result = np.empty(n, dtype=object)
501503

502504
for i in range(n):
503-
result[i] = unicode(arr[i])
505+
arr_i = arr[i]
506+
507+
if not (skipna and checknull(arr_i)):
508+
arr_i = unicode(arr_i)
509+
510+
result[i] = arr_i
504511

505512
return result
506513

507514

508-
def astype_str(arr: ndarray) -> ndarray[object]:
515+
def astype_str(arr: ndarray,
516+
skipna: bool=False) -> ndarray[object]:
509517
cdef:
518+
object arr_i
510519
Py_ssize_t i, n = arr.size
511520
ndarray[object] result = np.empty(n, dtype=object)
512521

513522
for i in range(n):
514-
result[i] = str(arr[i])
523+
arr_i = arr[i]
524+
525+
if not (skipna and checknull(arr_i)):
526+
arr_i = str(arr_i)
527+
528+
result[i] = arr_i
515529

516530
return result
517531

pandas/io/parsers.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414

1515
from pandas import compat
1616
from pandas.compat import (range, lrange, PY3, StringIO, lzip,
17-
zip, string_types, map, u)
17+
zip, text_type, string_types, map, u)
1818
from pandas.core.dtypes.common import (
19-
is_integer, ensure_object,
19+
pandas_dtype, is_integer, ensure_object,
2020
is_list_like, is_integer_dtype,
2121
is_float, is_dtype_equal,
2222
is_object_dtype, is_string_dtype,
@@ -1685,7 +1685,22 @@ def _cast_types(self, values, cast_type, column):
16851685

16861686
else:
16871687
try:
1688-
values = astype_nansafe(values, cast_type, copy=True)
1688+
# gh-20377
1689+
#
1690+
# The C parser does not convert np.NaN to 'nan' if
1691+
# the cast dtype is string-like.
1692+
cast_type = pandas_dtype(cast_type)
1693+
1694+
if issubclass(cast_type.type, string_types + (text_type,)):
1695+
val_shape = values.shape
1696+
val_flat = values.ravel()
1697+
1698+
method = ("astype_unicode" if issubclass(
1699+
cast_type.type, text_type) else "astype_str")
1700+
values = getattr(lib, method)(
1701+
val_flat, skipna=True).reshape(val_shape)
1702+
else:
1703+
values = astype_nansafe(values, cast_type, copy=True)
16891704
except ValueError:
16901705
raise ValueError("Unable to convert column %s to "
16911706
"type %s" % (column, cast_type))

pandas/tests/io/parser/na_values.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
parsing for all of the parsers defined in parsers.py
66
"""
77

8+
import pytest
89
import numpy as np
910
from numpy import nan
1011

@@ -380,3 +381,18 @@ def test_inf_na_values_with_int_index(self):
380381
expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
381382
index=Index([1, 2], name="idx"))
382383
tm.assert_frame_equal(out, expected)
384+
385+
@pytest.mark.parametrize("na_filter", [True, False])
386+
def test_na_values_with_dtype_str_and_na_filter(self, na_filter):
387+
# see gh-20377
388+
data = "a,b,c\n1,,3\n4,5,6"
389+
390+
# na_filter=True --> missing value becomes NaN.
391+
# na_filter=False --> missing value remains empty string.
392+
empty = np.nan if na_filter else ""
393+
expected = DataFrame({"a": ["1", "4"],
394+
"b": [empty, "5"],
395+
"c": ["3", "6"]})
396+
397+
result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
398+
tm.assert_frame_equal(result, expected)

pandas/tests/io/test_excel.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import pandas as pd
1515
import pandas.util.testing as tm
1616
import pandas.util._test_decorators as td
17-
from pandas import DataFrame, Index, MultiIndex
17+
from pandas import DataFrame, Index, MultiIndex, Series
1818
from pandas.compat import u, range, map, BytesIO, iteritems, PY36
1919
from pandas.core.config import set_option, get_option
2020
from pandas.io.common import URLError
@@ -371,7 +371,34 @@ def test_reader_dtype(self, ext):
371371
tm.assert_frame_equal(actual, expected)
372372

373373
with pytest.raises(ValueError):
374-
actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
374+
self.get_exceldf(basename, ext, dtype={'d': 'int64'})
375+
376+
@pytest.mark.parametrize("dtype,expected", [
377+
(None,
378+
DataFrame({
379+
"a": [1, 2, 3, 4],
380+
"b": [2.5, 3.5, 4.5, 5.5],
381+
"c": [1, 2, 3, 4],
382+
"d": [1.0, 2.0, np.nan, 4.0]
383+
})),
384+
({"a": "float64",
385+
"b": "float32",
386+
"c": str,
387+
"d": str
388+
},
389+
DataFrame({
390+
"a": Series([1, 2, 3, 4], dtype="float64"),
391+
"b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
392+
"c": ["001", "002", "003", "004"],
393+
"d": ["1", "2", np.nan, "4"]
394+
})),
395+
])
396+
def test_reader_dtype_str(self, ext, dtype, expected):
397+
# see gh-20377
398+
basename = "testdtype"
399+
400+
actual = self.get_exceldf(basename, ext, dtype=dtype)
401+
tm.assert_frame_equal(actual, expected)
375402

376403
def test_reading_all_sheets(self, ext):
377404
# Test reading all sheetnames by setting sheetname to None,

0 commit comments

Comments
 (0)