diff --git a/doc/source/api.rst b/doc/source/api.rst index a377fa3960d4c..e964ce569532a 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -126,6 +126,13 @@ Data manipulations merge concat +.. currentmodule:: pandas.core.reshape + +.. autosummary:: + :toctree: generated/ + + get_dummies + Top-level missing data ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/release.rst b/doc/source/release.rst index 56d51183a1834..7e8137b876a8c 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -44,6 +44,7 @@ pandas 0.13 ``ValueError`` (:issue:`4303`, :issue:`4305`) - ``read_excel`` now supports an integer in its ``sheetname`` argument giving the index of the sheet to read in (:issue:`4301`). + - ``get_dummies`` works with NaN (:issue:`4446`) - Added a test for ``read_clipboard()`` and ``to_clipboard()`` (:issue:`4282`) - Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf", "iNf", etc.) to infinity. (:issue:`4220`, :issue:`4219`), affecting diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 43ad0c32b0bfe..022799cd88014 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -129,6 +129,17 @@ Enhancements - Added a more informative error message when plot arguments contain overlapping color and style arguments (:issue:`4402`) + - NaN handling in ``get_dummies`` (:issue:`4446`) with ``dummy_na`` + + .. 
ipython:: python + # previously, nan was erroneously counted as 2 here + # now it is not counted at all + get_dummies([1, 2, np.nan]) + + # unless requested + get_dummies([1, 2, np.nan], dummy_na=True) + + - ``timedelta64[ns]`` operations - A Series of dtype ``timedelta64[ns]`` can now be divided by another diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 0ac45e52d64fc..a8a36ef8ca0be 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -18,7 +18,7 @@ import pandas.core.common as com import pandas.algos as algos -from pandas.core.index import MultiIndex +from pandas.core.index import Index, MultiIndex class ReshapeError(Exception): @@ -805,7 +805,7 @@ def convert_dummies(data, cat_variables, prefix_sep='_'): return result -def get_dummies(data, prefix=None, prefix_sep='_'): +def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False): """ Convert categorical variable into dummy/indicator variables @@ -816,19 +816,67 @@ def get_dummies(data, prefix=None, prefix_sep='_'): String to append DataFrame column names prefix_sep : string, default '_' If appending prefix, separator/delimiter to use + dummy_na : bool, default False + Add a column to indicate NaNs; if False, NaNs are ignored. 
Returns ------- dummies : DataFrame + + Examples + -------- + >>> s = pd.Series(list('abca')) + + >>> get_dummies(s) + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + + >>> s1 = ['a', 'b', np.nan] + + >>> get_dummies(s1) + a b + 0 1 0 + 1 0 1 + 2 0 0 + + >>> get_dummies(s1, dummy_na=True) + a b NaN + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + """ - cat = Categorical.from_array(np.asarray(data)) - dummy_mat = np.eye(len(cat.levels)).take(cat.labels, axis=0) + cat = Categorical.from_array(Series(data)) # Series avoids inconsistent NaN handling + levels = cat.levels + + # if all NaN + if not dummy_na and len(levels) == 0: + if isinstance(data, Series): + index = data.index + else: + index = np.arange(len(data)) + return DataFrame(index=index) + + number_of_cols = len(levels) + if dummy_na: + number_of_cols += 1 + + dummy_mat = np.eye(number_of_cols).take(cat.labels, axis=0) + + if dummy_na: + levels = np.append(cat.levels, np.nan) + else: + # reset NaN GH4446 + dummy_mat[cat.labels == -1] = 0 if prefix is not None: dummy_cols = ['%s%s%s' % (prefix, prefix_sep, str(v)) - for v in cat.levels] + for v in levels] else: - dummy_cols = cat.levels + dummy_cols = levels if isinstance(data, Series): index = data.index diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 0c6c34ff4dc29..e17b8c2aa72c9 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -7,13 +7,15 @@ import nose -from pandas import DataFrame +from pandas import DataFrame, Series import pandas as pd from numpy import nan import numpy as np -from pandas.core.reshape import melt, convert_dummies, lreshape +from pandas.util.testing import assert_frame_equal + +from pandas.core.reshape import melt, convert_dummies, lreshape, get_dummies import pandas.util.testing as tm from pandas.compat import StringIO, cPickle, range @@ -145,6 +147,60 @@ def test_multiindex(self): self.assertEqual(res.columns.tolist(), ['CAP', 'low', 'value']) +class TestGetDummies(unittest.TestCase): + def 
test_basic(self): + s_list = list('abc') + s_series = Series(s_list) + s_series_index = Series(s_list, list('ABC')) + + expected = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}, + 'c': {0: 0.0, 1: 0.0, 2: 1.0}}) + assert_frame_equal(get_dummies(s_list), expected) + assert_frame_equal(get_dummies(s_series), expected) + + expected.index = list('ABC') + assert_frame_equal(get_dummies(s_series_index), expected) + + def test_just_na(self): + just_na_list = [np.nan] + just_na_series = Series(just_na_list) + just_na_series_index = Series(just_na_list, index = ['A']) + + res_list = get_dummies(just_na_list) + res_series = get_dummies(just_na_series) + res_series_index = get_dummies(just_na_series_index) + + self.assertEqual(res_list.empty, True) + self.assertEqual(res_series.empty, True) + self.assertEqual(res_series_index.empty, True) + + self.assertEqual(res_list.index.tolist(), [0]) + self.assertEqual(res_series.index.tolist(), [0]) + self.assertEqual(res_series_index.index.tolist(), ['A']) + + def test_include_na(self): + s = ['a', 'b', np.nan] + res = get_dummies(s) + exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + assert_frame_equal(res, exp) + + res_na = get_dummies(s, dummy_na=True) + exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, + 'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}}).iloc[:, [1, 2, 0]] + # hack (NaN handling in assert_index_equal) + exp_na.columns = res_na.columns + assert_frame_equal(res_na, exp_na) + + res_just_na = get_dummies([nan], dummy_na=True) + exp_just_na = DataFrame({nan: {0: 1.0}}) + # hack (NaN handling in assert_index_equal) + exp_just_na.columns = res_just_na.columns + assert_frame_equal(res_just_na, exp_just_na) + + class TestConvertDummies(unittest.TestCase): def test_convert_dummies(self): df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',