BUG: Don't parse NaN as 'nan' in Data IO

gfyoung · gfyoung · commit dafaf5c5c2a5 · 2018-10-14T22:41:22.000-07:00
Closes gh-20377 (pandas-dev/pandas#20377).
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -494,24 +494,38 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
     return result
 
 
-def astype_unicode(arr: ndarray) -> ndarray[object]:
+def astype_unicode(arr: ndarray,
+                   skipna: bool=False) -> ndarray[object]:
     cdef:
+        object arr_i
         Py_ssize_t i, n = arr.size
         ndarray[object] result = np.empty(n, dtype=object)
 
     for i in range(n):
-        result[i] = unicode(arr[i])
+        arr_i = arr[i]
+
+        if not (skipna and checknull(arr_i)):
+            arr_i = unicode(arr_i)
+
+        result[i] = arr_i
 
     return result
 
 
-def astype_str(arr: ndarray) -> ndarray[object]:
+def astype_str(arr: ndarray,
+               skipna: bool=False) -> ndarray[object]:
     cdef:
+        object arr_i
         Py_ssize_t i, n = arr.size
         ndarray[object] result = np.empty(n, dtype=object)
 
     for i in range(n):
-        result[i] = str(arr[i])
+        arr_i = arr[i]
+
+        if not (skipna and checknull(arr_i)):
+            arr_i = str(arr_i)
+
+        result[i] = arr_i
 
     return result
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -14,9 +14,9 @@
 
 from pandas import compat
 from pandas.compat import (range, lrange, PY3, StringIO, lzip,
-                           zip, string_types, map, u)
+                           zip, text_type, string_types, map, u)
 from pandas.core.dtypes.common import (
-    is_integer, ensure_object,
+    pandas_dtype, is_integer, ensure_object,
     is_list_like, is_integer_dtype,
     is_float, is_dtype_equal,
     is_object_dtype, is_string_dtype,
@@ -1685,7 +1685,22 @@ def _cast_types(self, values, cast_type, column):
 
         else:
             try:
-                values = astype_nansafe(values, cast_type, copy=True)
+                # gh-20377
+                #
+                # The C parser does not convert np.NaN to 'nan' when
+                # the cast dtype is string-like, so skip nulls here too.
+                cast_type = pandas_dtype(cast_type)
+
+                if issubclass(cast_type.type, string_types + (text_type,)):
+                    val_shape = values.shape
+                    val_flat = values.ravel()
+
+                    method = ("astype_unicode" if issubclass(
+                        cast_type.type, text_type) else "astype_str")
+                    values = getattr(lib, method)(
+                        val_flat, skipna=True).reshape(val_shape)
+                else:
+                    values = astype_nansafe(values, cast_type, copy=True)
             except ValueError:
                 raise ValueError("Unable to convert column %s to "
                                  "type %s" % (column, cast_type))
diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py
@@ -5,6 +5,7 @@
 parsing for all of the parsers defined in parsers.py
 """
 
+import pytest
 import numpy as np
 from numpy import nan
 
@@ -380,3 +381,18 @@ def test_inf_na_values_with_int_index(self):
         expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
                              index=Index([1, 2], name="idx"))
         tm.assert_frame_equal(out, expected)
+
+    @pytest.mark.parametrize("na_filter", [True, False])
+    def test_na_values_with_dtype_str_and_na_filter(self, na_filter):
+        # see gh-20377
+        data = "a,b,c\n1,,3\n4,5,6"
+
+        # na_filter=True --> missing value becomes NaN.
+        # na_filter=False --> missing value remains empty string.
+        empty = np.nan if na_filter else ""
+        expected = DataFrame({"a": ["1", "4"],
+                              "b": [empty, "5"],
+                              "c": ["3", "6"]})
+
+        result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -14,7 +14,7 @@
 import pandas as pd
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
-from pandas import DataFrame, Index, MultiIndex
+from pandas import DataFrame, Index, MultiIndex, Series
 from pandas.compat import u, range, map, BytesIO, iteritems, PY36
 from pandas.core.config import set_option, get_option
 from pandas.io.common import URLError
@@ -371,7 +371,34 @@ def test_reader_dtype(self, ext):
         tm.assert_frame_equal(actual, expected)
 
         with pytest.raises(ValueError):
-            actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
+            self.get_exceldf(basename, ext, dtype={'d': 'int64'})
+
+    @pytest.mark.parametrize("dtype,expected", [
+        (None,
+         DataFrame({
+             "a": [1, 2, 3, 4],
+             "b": [2.5, 3.5, 4.5, 5.5],
+             "c": [1, 2, 3, 4],
+             "d": [1.0, 2.0, np.nan, 4.0]
+         })),
+        ({"a": "float64",
+          "b": "float32",
+          "c": str,
+          "d": str
+          },
+         DataFrame({
+             "a": Series([1, 2, 3, 4], dtype="float64"),
+             "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
+             "c": ["001", "002", "003", "004"],
+             "d": ["1", "2", np.nan, "4"]
+         })),
+    ])
+    def test_reader_dtype_str(self, ext, dtype, expected):
+        # see gh-20377
+        basename = "testdtype"
+
+        actual = self.get_exceldf(basename, ext, dtype=dtype)
+        tm.assert_frame_equal(actual, expected)
 
     def test_reading_all_sheets(self, ext):
         # Test reading all sheetnames by setting sheetname to None,