diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst
index 5da0f4fd07819..7ed0b1c800183 100644
--- a/doc/source/gotchas.rst
+++ b/doc/source/gotchas.rst
@@ -332,3 +332,97 @@ using something similar to the following:
 See `the NumPy documentation on byte order
 <https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
 details.
+
+
+Alternative to storing lists in Pandas DataFrame Cells
+------------------------------------------------------
+Storing nested lists/arrays inside a pandas object should be avoided for performance and memory use reasons. Instead they should be "exploded" into a flat DataFrame structure.
+
+Example of exploding nested lists into a DataFrame:
+
+.. ipython:: python
+
+    from collections import OrderedDict
+    df = (pd.DataFrame(OrderedDict([('name', ['A.J. Price']*3),
+                                    ('opponent', ['76ers', 'blazers', 'bobcats']),
+                                    ('attribute x', ['A','B','C'])
+                                    ])
+          ))
+    df
+
+    nn = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3
+    nn
+
+    # Step 1: Create an index with the "parent" columns to be included in the final DataFrame
+    df2 = pd.concat([df[['name','opponent']], pd.DataFrame(nn)], axis=1)
+    df2
+
+    # Step 2: Transform the column with lists into Series, which become columns in a new DataFrame.
+    # Note that only the index from the original df is retained -
+    # any other columns in the original df are not part of the new df
+    df3 = df2.set_index(['name', 'opponent'])
+    df3
+
+    # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
+    # Note that at this point we have a Series, not a DataFrame
+    ser = df3.stack()
+    ser
+
+    # Step 4: Drop the extraneous index level created by the stack
+    ser.reset_index(level=2, drop=True, inplace=True)
+    ser
+
+    # Step 5: Create a DataFrame from the Series
+    df4 = ser.to_frame('nearest_neighbors')
+    df4
+
+    # All steps in one stack
+    df4 = (df2.set_index(['name', 'opponent'])
+              .stack()
+              .reset_index(level=2, drop=True)
+              .to_frame('nearest_neighbors'))
+    df4
+
+Example of exploding a list embedded in a DataFrame:
+
+.. ipython:: python
+
+    df = (pd.DataFrame(OrderedDict([('name', ['A.J. Price']*3),
+                                    ('opponent', ['76ers', 'blazers', 'bobcats']),
+                                    ('attribute x', ['A','B','C']),
+                                    ('nearest_neighbors', [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3)
+                                    ])
+          ))
+
+    df
+
+    # Step 1: Create an index with the "parent" columns to be included in the final DataFrame
+    df2 = df.set_index(['name', 'opponent'])
+    df2
+
+    # Step 2: Transform the column with lists into Series, which become columns in a new DataFrame.
+    # Note that only the index from the original df is retained -
+    # any other columns in the original df are not part of the new df
+    df3 = df2.nearest_neighbors.apply(pd.Series)
+    df3
+
+    # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
+    # Note that at this point we have a Series, not a DataFrame
+    ser = df3.stack()
+    ser
+
+    # Step 4: Drop the extraneous index level created by the stack
+    ser.reset_index(level=2, drop=True, inplace=True)
+    ser
+
+    # Step 5: Create a DataFrame from the Series
+    df4 = ser.to_frame('nearest_neighbors')
+    df4
+
+    # All steps in one stack
+    df4 = (df.set_index(['name', 'opponent'])
+             .nearest_neighbors.apply(pd.Series)
+             .stack()
+             .reset_index(level=2, drop=True)
+             .to_frame('nearest_neighbors'))
+    df4
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index 552ddabb7359a..1f9befb4bd59c 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -915,13 +915,17 @@ The dimension of the returned result can also change:
 
 So depending on the path taken, and exactly what you are grouping. Thus
 the grouped columns(s) may be included in the output as well as set the indices.
 
 .. warning::
 
-   In the current implementation apply calls func twice on the
+   * In the current implementation apply calls func twice on the
    first group to decide whether it can take a fast or slow code
    path. This can lead to unexpected behavior if func has
    side-effects, as they will take effect twice for the first
    group.
+
+   * Apply should not perform in-place operations on the group chunk.
+     Group chunks should be treated as immutable, and changes to a
+     group chunk may produce unexpected results.
 
 .. ipython:: python
 
@@ -955,6 +959,42 @@ will be (silently) dropped. Thus, this does not pose any problems:
 
    df.groupby('A').std()
 
+.. note::
+   Decimal columns are also "nuisance" columns. They are excluded from aggregate functions automatically in groupby.
+
+   If you do wish to include decimal columns in the aggregation, you must do so explicitly:
+
+.. ipython:: python
+
+   from decimal import Decimal
+   dec = pd.DataFrame(
+       {'name': ['foo', 'bar', 'foo', 'bar'],
+        'title': ['boo', 'far', 'boo', 'far'],
+        'id': [123, 456, 123, 456],
+        'int_column': [1, 2, 3, 4],
+        'dec_column1': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')],
+        'dec_column2': [Decimal('0.20'), Decimal('0.30'), Decimal('0.55'), Decimal('0.60')]
+       },
+       columns=['name','title','id','int_column','dec_column1','dec_column2']
+   )
+
+   dec.head()
+
+   dec.dtypes
+
+   # Decimal columns excluded from sum by default
+   dec.groupby(['name', 'title', 'id'], as_index=False).sum()
+
+   # Decimal columns can be summed explicitly by themselves...
+   dec.groupby(['name', 'title', 'id'], as_index=False)['dec_column1','dec_column2'].sum()
+
+   # ...but cannot be combined with standard data types or they will be excluded
+   dec.groupby(['name', 'title', 'id'], as_index=False)['int_column','dec_column1','dec_column2'].sum()
+
+   # Use .agg function to aggregate over standard and "nuisance" data types at the same time
+   dec.groupby(['name', 'title', 'id'], as_index=False).agg({'int_column': 'sum', 'dec_column1': 'sum', 'dec_column2': 'sum'})
+
+
 .. _groupby.missing:
 
 NA and NaT group handling