Skip to content

Commit 858ac9c

Browse files
changhiskhan authored and jreback committed
[ENH] Add DataFrame method to explode a list-like column (GH pandas-dev#16538)
Sometimes a values column is presented with list-like values on one row. Instead we may want to split each individual value onto its own row, keeping the same mapping to the other key columns. While it's possible to chain together existing pandas operations (in fact that's exactly what this implementation is) to do this, the sequence of operations is not obvious. By contrast this is available as a built-in operation in say Spark and is a fairly common use case.
1 parent 6bc1cf4 commit 858ac9c

File tree

5 files changed

+225
-0
lines changed

5 files changed

+225
-0
lines changed

asv_bench/benchmarks/reshape.py

+18
Original file line numberDiff line numberDiff line change
@@ -240,4 +240,22 @@ def time_qcut_datetime(self, bins):
240240
pd.qcut(self.datetime_series, bins)
241241

242242

243+
class Explode:
    """Benchmark ``DataFrame.explode`` on a comma-separated string column.

    The frame under test has a ``key`` column holding between zero and
    ``max_list_length`` random ASCII letters joined by commas, and a
    ``value`` column of random floats.
    """

    param_names = ['n_rows', 'max_list_length']
    params = [[100, 1000, 10000], [3, 5, 10]]

    def setup(self, n_rows, max_list_length):
        """Build the ``n_rows``-row frame that ``time_explode`` operates on."""
        import string

        # Build the alphabet once instead of once per generated letter.
        letters = list(string.ascii_letters)
        # ``randint`` excludes its upper bound; add one so that rows can
        # actually contain ``max_list_length`` items.
        num_letters = np.random.randint(0, max_list_length + 1, n_rows)
        # One vectorised draw per row; a row with zero letters yields ''.
        key_column = [','.join(np.random.choice(letters, size=k))
                      for k in num_letters]
        value_column = np.random.randn(n_rows)
        self.frame = pd.DataFrame({'key': key_column,
                                   'value': value_column})

    def time_explode(self, n_rows, max_list_length):
        """Time splitting ``key`` on ',' and exploding it onto rows."""
        self.frame.explode('key', sep=',')
259+
260+
243261
from .pandas_vb_common import setup # noqa: F401

doc/source/user_guide/reshaping.rst

+31
Original file line numberDiff line numberDiff line change
@@ -801,3 +801,34 @@ Note to subdivide over multiple columns we can pass in a list to the
801801
802802
df.pivot_table(
803803
values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
804+
805+
.. _reshaping.explode:
806+
807+
Exploding a List-like Column
808+
----------------------------
809+
810+
Sometimes the values in a column are list-like:
811+
812+
.. ipython:: python
813+
814+
keys = ['panda1', 'panda2', 'panda3']
815+
values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
816+
df = pd.DataFrame({'keys': keys, 'values': values})
817+
df
818+
819+
But we actually want to put each value onto its own row.
820+
For this purpose we can use ``DataFrame.explode``:
821+
822+
.. ipython:: python
823+
824+
df.explode('values')
825+
826+
For convenience, we can use the optional keyword ``sep`` to automatically
827+
split a string column before exploding:
828+
829+
.. ipython:: python
830+
831+
values = ['eats,shoots', 'shoots,leaves', 'eats,shoots,leaves']
832+
df2 = pd.DataFrame({'keys': keys, 'values': values})
833+
df2
834+
df2.explode('values', sep=',')

doc/source/whatsnew/v0.24.0.rst

+30
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,37 @@ This is a major release from 0.23.4 and includes a number of API changes, new
1515
features, enhancements, and performance improvements along with a large number
1616
of bug fixes.
1717

18+
<<<<<<< HEAD
1819
Highlights include:
20+
=======
21+
These are the changes in pandas 0.24.0. See :ref:`release` for a full changelog
22+
including other versions of pandas.
23+
24+
.. _whatsnew_0240.enhancements:
25+
26+
New features
27+
~~~~~~~~~~~~
28+
- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`)
29+
- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
30+
- ``FrozenList`` has gained the ``.union()`` and ``.difference()`` methods. This functionality greatly simplifies groupby's that rely on explicitly excluding certain columns. See :ref:`Splitting an object into groups <groupby.split>` for more information (:issue:`15475`, :issue:`15506`).
31+
- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing
32+
the user to override the engine's default behavior to include or omit the
33+
dataframe's indexes from the resulting Parquet file. (:issue:`20768`)
34+
- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`)
35+
- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`)
36+
- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`)
37+
- :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame.
38+
See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`)
39+
- :meth:`DataFrame.explode` to split list-like values onto individual rows. See the :ref:`section on exploding a list-like column <reshaping.explode>` in the docs for more information (:issue:`16538`)
40+
41+
.. _whatsnew_0240.values_api:
42+
43+
Accessing the values in a Series or Index
44+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
45+
46+
:attr:`Series.array` and :attr:`Index.array` have been added for extracting the array backing a
47+
``Series`` or ``Index``. (:issue:`19954`, :issue:`23623`)
48+
>>>>>>> 2138ef063... [ENH] Add DataFrame method to explode a list-like column (GH #16538)
1949

2050
* :ref:`Optional Integer NA Support <whatsnew_0240.enhancements.intna>`
2151
* :ref:`New APIs for accessing the array backing a Series or Index <whatsnew_0240.values_api>`

pandas/core/frame.py

+51
Original file line numberDiff line numberDiff line change
@@ -6436,6 +6436,57 @@ def melt(
64366436
col_level=col_level,
64376437
)
64386438

6439+
def explode(self, col_name, sep=None, dtype=None):
    """
    Create new DataFrame expanding a list-like column.

    Every element of a list-like value in `col_name` is placed on its own
    row. The index label and the values of all other columns of the
    originating row are repeated for each element. Scalar values are
    kept as a single row.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    col_name : str
        Name of the column to be exploded.
    sep : str, default None
        Convenience to split a string `col_name` before exploding.
        ``None`` or an empty string means no splitting is done.
    dtype : str or dtype, default None
        Optionally coerce the dtype of exploded column.

    Returns
    -------
    DataFrame
        Frame with the same columns, in the same order, as the caller.

    Raises
    ------
    KeyError
        If `col_name` is not a column of the frame.

    See Also
    --------
    Series.str.split : Split string values on specified separator.
    DataFrame.melt : Unpivot a DataFrame from wide to long format.

    Notes
    -----
    Missing values and empty list-likes are omitted from the result, so
    a row whose value is empty or entirely missing produces no rows.
    Unless `dtype` is given, the dtype of the exploded column is inferred
    from the flattened elements.

    Examples
    --------
    >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]})
    >>> df.explode('k', sep=',')
       k  v
    0  a  0
    0  b  0
    1  c  1
    1  d  1
    """
    # Imported locally so the method only relies on ``Series`` from the
    # enclosing module.
    from pandas.core.dtypes.common import is_list_like
    from pandas.core.dtypes.missing import isna

    # Look the column up first so an unknown name raises KeyError even
    # for an empty frame.
    col = self[col_name]
    if len(self) == 0:
        return self.copy()
    if sep:
        # Non-string entries become missing here and are dropped below.
        col = col.str.split(sep)

    # Flatten in a single pass, remembering the *position* of the source
    # row for every element that is kept.
    values = []
    positions = []
    for position, item in enumerate(col):
        if isinstance(item, dict):
            elements = item.values()
        elif is_list_like(item):
            elements = item
        else:
            elements = (item,)
        for element in elements:
            # Nested list-likes are kept as-is; scalar NAs are dropped.
            if is_list_like(element) or not isna(element):
                values.append(element)
                positions.append(position)

    # Repeat the remaining columns by position rather than joining on
    # index labels: a label-based join is many-to-many when the index is
    # not unique and would pair elements with the wrong rows.
    result = self.drop(col_name, axis=1).iloc[positions].copy()
    exploded = Series(values, index=result.index, name=col_name)
    # ``is not None`` rather than truthiness: an unstructured ``np.dtype``
    # has length zero and is therefore falsy.
    if dtype is not None:
        exploded = exploded.astype(dtype)
    # The indexes are identical, so this assigns positionally.
    result[col_name] = exploded
    return result.reindex(self.columns, axis=1)
6489+
64396490
# ----------------------------------------------------------------------
64406491
# Time series-related
64416492

pandas/tests/frame/test_reshape.py

+95
Original file line numberDiff line numberDiff line change
@@ -1043,6 +1043,101 @@ def test_unstack_swaplevel_sortlevel(self, level):
10431043
tm.assert_frame_equal(result, expected)
10441044

10451045

1046+
class TestDataFrameExplode:
    """Tests for ``DataFrame.explode`` (GH 16538)."""

    # Column labels shared by the frames built in the tests below.
    columns = ['a', 'b', 'c']

    def test_sep(self):
        # Automatically do str.split
        df = pd.DataFrame([['foo,bar', 'x', 42],
                           ['fizz,buzz', 'y', 43]],
                          columns=self.columns)
        rs = df.explode('a', sep=',')
        xp = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                           'b': ['x', 'x', 'y', 'y'],
                           'c': [42, 42, 43, 43]},
                          index=[0, 0, 1, 1])
        tm.assert_frame_equal(rs, xp)

    def test_dtype(self):
        # Coerce dtype
        df = pd.DataFrame([[[0, 1, 4], 'x', 42],
                           [[2, 3], 'y', 43]],
                          columns=self.columns)
        rs = df.explode('a', dtype='int')
        xp = pd.DataFrame({'a': np.array([0, 1, 4, 2, 3], dtype='int'),
                           'b': ['x', 'x', 'x', 'y', 'y'],
                           'c': [42, 42, 42, 43, 43]},
                          index=[0, 0, 0, 1, 1])
        tm.assert_frame_equal(rs, xp)

    def test_na(self):
        # NaN's and empty lists are omitted
        # TODO: option to preserve explicit NAs instead
        df = pd.DataFrame([[[], 'x', 42],
                           [[2.0, np.nan], 'y', 43]],
                          columns=self.columns)
        rs = df.explode('a')
        xp = pd.DataFrame({'a': [2.0],
                           'b': ['y'],
                           'c': [43]},
                          index=[1])
        tm.assert_frame_equal(rs, xp)

    def test_nonuniform_type(self):
        # Not everything is a list
        df = pd.DataFrame([[[0, 1, 4], 'x', 42],
                           [3, 'y', 43]],
                          columns=self.columns)
        rs = df.explode('a', dtype='int')
        xp = pd.DataFrame({'a': np.array([0, 1, 4, 3], dtype='int'),
                           'b': ['x', 'x', 'x', 'y'],
                           'c': [42, 42, 42, 43]},
                          index=[0, 0, 0, 1])
        tm.assert_frame_equal(rs, xp)

    def test_all_scalars(self):
        # Nothing is a list
        df = pd.DataFrame([[0, 'x', 42],
                           [3, 'y', 43]],
                          columns=self.columns)
        rs = df.explode('a')
        xp = pd.DataFrame({'a': [0, 3],
                           'b': ['x', 'y'],
                           'c': [42, 43]},
                          index=[0, 1])
        tm.assert_frame_equal(rs, xp)

    def test_empty(self):
        # Empty frame
        rs = pd.DataFrame(columns=['a', 'b']).explode('a')
        xp = pd.DataFrame(columns=['a', 'b'])
        tm.assert_frame_equal(rs, xp)

    def test_missing_column(self):
        # Bad column name
        df = pd.DataFrame([[0, 'x', 42],
                           [3, 'y', 43]],
                          columns=self.columns)
        # Context-manager form keeps the call under test readable and
        # limits the expectation to exactly this statement.
        with pytest.raises(KeyError):
            df.explode('badcolumnname')

    def test_multi_index(self):
        # Multi-index
        idx = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')])
        df = pd.DataFrame([['foo,bar', 'x', 42],
                           ['fizz,buzz', 'y', 43]],
                          columns=self.columns,
                          index=idx)
        rs = df.explode('a', sep=',')
        idx = pd.MultiIndex.from_tuples(
            [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')])
        xp = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                           'b': ['x', 'x', 'y', 'y'],
                           'c': [42, 42, 43, 43]},
                          index=idx)
        tm.assert_frame_equal(rs, xp)
1139+
1140+
10461141
def test_unstack_fill_frame_object():
10471142
# GH12815 Test unstacking with object.
10481143
data = pd.Series(["a", "b", "c", "a"], dtype="object")

0 commit comments

Comments
 (0)