diff --git a/doc/source/io.rst b/doc/source/io.rst index cc693170f055a..26e928020b893 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -208,7 +208,7 @@ memory_map : boolean, default False NA and Missing Data Handling ++++++++++++++++++++++++++++ -na_values : str, list-like or dict, default ``None`` +na_values : scalar, str, list-like, or dict, default ``None`` Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA', diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index e60ac7f3773f0..08b59390339aa 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -957,6 +957,7 @@ Bug Fixes - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) - Bug in ``groupby().cumsum()`` calculating ``cumprod`` when ``axis=1``. (:issue:`13994`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) +- Bug in ``pd.read_csv()``, which caused errors to be raised when a dictionary containing scalars is passed in for ``na_values`` (:issue:`12224`) - Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) - Bug in ``pd.read_csv()`` with ``engine='python'`` which raised errors when a numpy array was passed in for ``usecols`` (:issue:`12546`) - Bug in ``pd.to_timedelta()`` in which the ``errors`` parameter was not being respected (:issue:`13613`) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index c713cafc0e110..5e4dd4379a8e3 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -94,7 +94,7 @@ column ranges (e.g. "A:E" or "A,C,E:F") squeeze : boolean, default False If the parsed data only contains one column then return a Series -na_values : str or list-like or dict, default None +na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: '""" + "', '".join(sorted(_NA_VALUES)) + """'. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9a7c966031044..e40ea611fcd0a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -129,7 +129,7 @@ DEPRECATED: use the `skipfooter` parameter instead, as they are identical nrows : int, default None Number of rows of file to read. Useful for reading pieces of large files -na_values : str or list-like or dict, default None +na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: `'""" + "'`, `'".join(sorted(_NA_VALUES)) + """'`. @@ -1604,8 +1604,8 @@ def TextParser(*args, **kwds): has_index_names: boolean, default False True if the cols defined in index_col have an index name and are not in the header - na_values : iterable, default None - Custom NA values + na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. keep_default_na : bool, default True thousands : str, default None Thousands separator @@ -2687,7 +2687,9 @@ def _clean_na_values(na_values, keep_default_na=True): elif isinstance(na_values, dict): if keep_default_na: for k, v in compat.iteritems(na_values): - v = set(list(v)) | _NA_VALUES + if not is_list_like(v): + v = [v] + v = set(v) | _NA_VALUES na_values[k] = v na_fvalues = dict([ (k, _floatify_na_values(v)) for k, v in na_values.items() # noqa diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index 2a8c934abce61..92107cf2e82a7 100644 --- a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -250,3 +250,19 @@ def test_na_trailing_columns(self): result = self.read_csv(StringIO(data)) self.assertEqual(result['Date'][1], '2012-05-12') self.assertTrue(result['UnitPrice'].isnull().all()) + + def test_na_values_scalar(self): + # see gh-12224 + names = ['a', 'b'] + data = '1,2\n2,1' + + expected = DataFrame([[np.nan, 2.0], [2.0, np.nan]], + columns=names) + out = self.read_csv(StringIO(data), names=names, na_values=1) + tm.assert_frame_equal(out, expected) + + expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], + columns=names) + out = self.read_csv(StringIO(data), names=names, + na_values={'a': 2, 'b': 1}) + tm.assert_frame_equal(out, expected)