diff --git a/pandas/core/api.py b/pandas/core/api.py index 2f818a400162b..1f46aaa40e9eb 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -24,8 +24,8 @@ from pandas.core.panel import Panel, WidePanel from pandas.core.panel4d import Panel4D from pandas.core.reshape.reshape import ( - pivot_simple as pivot, get_dummies, - lreshape, wide_to_long) + pivot_simple as pivot, get_dummies) +from pandas.core.reshape.melt import lreshape, wide_to_long from pandas.core.indexing import IndexSlice from pandas.core.tools.numeric import to_numeric diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 70f1ff0a5380d..ded3c51edbece 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4637,7 +4637,7 @@ def unstack(self, level=-1, fill_value=None): other='melt')) def melt(self, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): - from pandas.core.reshape.reshape import melt + from pandas.core.reshape.melt import melt return melt(self, id_vars=id_vars, value_vars=value_vars, var_name=var_name, value_name=value_name, col_level=col_level) diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py index c75e0341918bb..99286d807a205 100644 --- a/pandas/core/reshape/api.py +++ b/pandas/core/reshape/api.py @@ -1,7 +1,7 @@ # flake8: noqa from pandas.core.reshape.concat import concat -from pandas.core.reshape.reshape import melt +from pandas.core.reshape.melt import melt from pandas.core.reshape.merge import ( merge, ordered_merge, merge_ordered, merge_asof) from pandas.core.reshape.pivot import pivot_table, crosstab diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py new file mode 100644 index 0000000000000..846d04221fe7f --- /dev/null +++ b/pandas/core/reshape/melt.py @@ -0,0 +1,386 @@ +# pylint: disable=E1101,E1103 +# pylint: disable=W0703,W0622,W0613,W0201 +import numpy as np + +from pandas.core.dtypes.common import is_list_like +from pandas import compat +from pandas.core.categorical import Categorical + +from pandas.core.frame import DataFrame +from pandas.core.index import MultiIndex + +from pandas.core.frame import _shared_docs +from pandas.util._decorators import Appender + +import re +import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.missing import notna + + +@Appender(_shared_docs['melt'] % + dict(caller='pd.melt(df, ', + versionadded="", + other='DataFrame.melt')) +def melt(frame, id_vars=None, value_vars=None, var_name=None, + value_name='value', col_level=None): + # TODO: what about the existing index? + if id_vars is not None: + if not is_list_like(id_vars): + id_vars = [id_vars] + elif (isinstance(frame.columns, MultiIndex) and + not isinstance(id_vars, list)): + raise ValueError('id_vars must be a list of tuples when columns' + ' are a MultiIndex') + else: + id_vars = list(id_vars) + else: + id_vars = [] + + if value_vars is not None: + if not is_list_like(value_vars): + value_vars = [value_vars] + elif (isinstance(frame.columns, MultiIndex) and + not isinstance(value_vars, list)): + raise ValueError('value_vars must be a list of tuples when' + ' columns are a MultiIndex') + else: + value_vars = list(value_vars) + frame = frame.loc[:, id_vars + value_vars] + else: + frame = frame.copy() + + if col_level is not None: # allow list or other? + # frame is a copy + frame.columns = frame.columns.get_level_values(col_level) + + if var_name is None: + if isinstance(frame.columns, MultiIndex): + if len(frame.columns.names) == len(set(frame.columns.names)): + var_name = frame.columns.names + else: + var_name = ['variable_{i}'.format(i=i) + for i in range(len(frame.columns.names))] + else: + var_name = [frame.columns.name if frame.columns.name is not None + else 'variable'] + if isinstance(var_name, compat.string_types): + var_name = [var_name] + + N, K = frame.shape + K -= len(id_vars) + + mdata = {} + for col in id_vars: + mdata[col] = np.tile(frame.pop(col).values, K) + + mcolumns = id_vars + var_name + [value_name] + + mdata[value_name] = frame.values.ravel('F') + for i, col in enumerate(var_name): + # asanyarray will keep the columns as an Index + mdata[col] = np.asanyarray(frame.columns + ._get_level_values(i)).repeat(N) + + return DataFrame(mdata, columns=mcolumns) + + +def lreshape(data, groups, dropna=True, label=None): + """ + Reshape long-format data to wide. Generalized inverse of DataFrame.pivot + + Parameters + ---------- + data : DataFrame + groups : dict + {new_name : list_of_columns} + dropna : boolean, default True + + Examples + -------- + >>> import pandas as pd + >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526], + ... 'team': ['Red Sox', 'Yankees'], + ... 'year1': [2007, 2007], 'year2': [2008, 2008]}) + >>> data + hr1 hr2 team year1 year2 + 0 514 545 Red Sox 2007 2008 + 1 573 526 Yankees 2007 2008 + + >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}) + team year hr + 0 Red Sox 2007 514 + 1 Yankees 2007 573 + 2 Red Sox 2008 545 + 3 Yankees 2008 526 + + Returns + ------- + reshaped : DataFrame + """ + if isinstance(groups, dict): + keys = list(groups.keys()) + values = list(groups.values()) + else: + keys, values = zip(*groups) + + all_cols = list(set.union(*[set(x) for x in values])) + id_cols = list(data.columns.difference(all_cols)) + + K = len(values[0]) + + for seq in values: + if len(seq) != K: + raise ValueError('All column lists must be same length') + + mdata = {} + pivot_cols = [] + + for target, names in zip(keys, values): + to_concat = [data[col].values for col in names] + mdata[target] = _concat._concat_compat(to_concat) + pivot_cols.append(target) + + for col in id_cols: + mdata[col] = np.tile(data[col].values, K) + + if dropna: + mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool) + for c in pivot_cols: + mask &= notna(mdata[c]) + if not mask.all(): + mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata)) + + return DataFrame(mdata, columns=id_cols + pivot_cols) + + +def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'): + r""" + Wide panel to long format. Less flexible but more user-friendly than melt. + + With stubnames ['A', 'B'], this function expects to find one or more + group of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,... + You specify what you want to call this suffix in the resulting long format + with `j` (for example `j='year'`) + + Each row of these wide variables are assumed to be uniquely identified by + `i` (can be a single column name or a list of column names) + + All remaining variables in the data frame are left intact. + + Parameters + ---------- + df : DataFrame + The wide-format DataFrame + stubnames : str or list-like + The stub name(s). The wide format variables are assumed to + start with the stub names. + i : str or list-like + Column(s) to use as id variable(s) + j : str + The name of the subobservation variable. What you wish to name your + suffix in the long format. + sep : str, default "" + A character indicating the separation of the variable names + in the wide format, to be stripped from the names in the long format. + For example, if your column names are A-suffix1, A-suffix2, you + can strip the hypen by specifying `sep='-'` + + .. versionadded:: 0.20.0 + + suffix : str, default '\\d+' + A regular expression capturing the wanted suffixes. '\\d+' captures + numeric suffixes. Suffixes with no numbers could be specified with the + negated character class '\\D+'. You can also further disambiguate + suffixes, for example, if your wide variables are of the form + Aone, Btwo,.., and you have an unrelated column Arating, you can + ignore the last one by specifying `suffix='(!?one|two)'` + + .. versionadded:: 0.20.0 + + Returns + ------- + DataFrame + A DataFrame that contains each stub name as a variable, with new index + (i, j) + + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> np.random.seed(123) + >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, + ... "A1980" : {0 : "d", 1 : "e", 2 : "f"}, + ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, + ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, + ... "X" : dict(zip(range(3), np.random.randn(3))) + ... }) + >>> df["id"] = df.index + >>> df + A1970 A1980 B1970 B1980 X id + 0 a d 2.5 3.2 -1.085631 0 + 1 b e 1.2 1.3 0.997345 1 + 2 c f 0.7 0.1 0.282978 2 + >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") + ... # doctest: +NORMALIZE_WHITESPACE + X A B + id year + 0 1970 -1.085631 a 2.5 + 1 1970 0.997345 b 1.2 + 2 1970 0.282978 c 0.7 + 0 1980 -1.085631 d 3.2 + 1 1980 0.997345 e 1.3 + 2 1980 0.282978 f 0.1 + + With multuple id columns + + >>> df = pd.DataFrame({ + ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + ... }) + >>> df + birth famid ht1 ht2 + 0 1 1 2.8 3.4 + 1 2 1 2.9 3.8 + 2 3 1 2.2 2.9 + 3 1 2 2.0 3.2 + 4 2 2 1.8 2.8 + 5 3 2 1.9 2.4 + 6 1 3 2.2 3.3 + 7 2 3 2.3 3.4 + 8 3 3 2.1 2.9 + >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') + >>> l + ... # doctest: +NORMALIZE_WHITESPACE + ht + famid birth age + 1 1 1 2.8 + 2 3.4 + 2 1 2.9 + 2 3.8 + 3 1 2.2 + 2 2.9 + 2 1 1 2.0 + 2 3.2 + 2 1 1.8 + 2 2.8 + 3 1 1.9 + 2 2.4 + 3 1 1 2.2 + 2 3.3 + 2 1 2.3 + 2 3.4 + 3 1 2.1 + 2 2.9 + + Going from long back to wide just takes some creative use of `unstack` + + >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack() + >>> w.columns = pd.Index(w.columns).str.join('') + >>> w.reset_index() + famid birth ht1 ht2 + 0 1 1 2.8 3.4 + 1 1 2 2.9 3.8 + 2 1 3 2.2 2.9 + 3 2 1 2.0 3.2 + 4 2 2 1.8 2.8 + 5 2 3 1.9 2.4 + 6 3 1 2.2 3.3 + 7 3 2 2.3 3.4 + 8 3 3 2.1 2.9 + + Less wieldy column names are also handled + + >>> np.random.seed(0) + >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3), + ... 'A(quarterly)-2011': np.random.rand(3), + ... 'B(quarterly)-2010': np.random.rand(3), + ... 'B(quarterly)-2011': np.random.rand(3), + ... 'X' : np.random.randint(3, size=3)}) + >>> df['id'] = df.index + >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 ... + 0 0.548814 0.544883 0.437587 ... + 1 0.715189 0.423655 0.891773 ... + 2 0.602763 0.645894 0.963663 ... + X id + 0 0 0 + 1 1 1 + 2 1 2 + + >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id', + ... j='year', sep='-') + ... # doctest: +NORMALIZE_WHITESPACE + X A(quarterly) B(quarterly) + id year + 0 2010 0 0.548814 0.437587 + 1 2010 1 0.715189 0.891773 + 2 2010 1 0.602763 0.963663 + 0 2011 0 0.544883 0.383442 + 1 2011 1 0.423655 0.791725 + 2 2011 1 0.645894 0.528895 + + If we have many columns, we could also use a regex to find our + stubnames and pass that list on to wide_to_long + + >>> stubnames = sorted( + ... set([match[0] for match in df.columns.str.findall( + ... r'[A-B]\(.*\)').values if match != [] ]) + ... ) + >>> list(stubnames) + ['A(quarterly)', 'B(quarterly)'] + + Notes + ----- + All extra variables are left untouched. This simply uses + `pandas.melt` under the hood, but is hard-coded to "do the right thing" + in a typicaly case. + """ + def get_var_names(df, stub, sep, suffix): + regex = "^{stub}{sep}{suffix}".format( + stub=re.escape(stub), sep=re.escape(sep), suffix=suffix) + return df.filter(regex=regex).columns.tolist() + + def melt_stub(df, stub, i, j, value_vars, sep): + newdf = melt(df, id_vars=i, value_vars=value_vars, + value_name=stub.rstrip(sep), var_name=j) + newdf[j] = Categorical(newdf[j]) + newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") + + return newdf.set_index(i + [j]) + + if any(map(lambda s: s in df.columns.tolist(), stubnames)): + raise ValueError("stubname can't be identical to a column name") + + if not is_list_like(stubnames): + stubnames = [stubnames] + else: + stubnames = list(stubnames) + + if not is_list_like(i): + i = [i] + else: + i = list(i) + + if df[i].duplicated().any(): + raise ValueError("the id variables need to uniquely identify each row") + + value_vars = list(map(lambda stub: + get_var_names(df, stub, sep, suffix), stubnames)) + + value_vars_flattened = [e for sublist in value_vars for e in sublist] + id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) + + melted = [] + for s, v in zip(stubnames, value_vars): + melted.append(melt_stub(df, s, i, j, v, sep)) + melted = melted[0].join(melted[1:], how='outer') + + if len(i) == 1: + new = df[id_vars].set_index(i).join(melted) + return new + + new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) + + return new diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index b8885820f4a49..96738afbca9e3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -4,7 +4,6 @@ from pandas import compat from functools import partial import itertools -import re import numpy as np @@ -14,7 +13,6 @@ needs_i8_conversion, is_sparse) from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.missing import notna -import pandas.core.dtypes.concat as _concat from pandas.core.series import Series from pandas.core.frame import DataFrame @@ -30,8 +28,6 @@ import pandas.core.algorithms as algos from pandas._libs import algos as _algos, reshape as _reshape -from pandas.core.frame import _shared_docs -from pandas.util._decorators import Appender from pandas.core.index import Index, MultiIndex, _get_na_value @@ -700,375 +696,6 @@ def _convert_level_number(level_num, columns): return result -@Appender(_shared_docs['melt'] % - dict(caller='pd.melt(df, ', - versionadded="", - other='DataFrame.melt')) -def melt(frame, id_vars=None, value_vars=None, var_name=None, - value_name='value', col_level=None): - # TODO: what about the existing index? - if id_vars is not None: - if not is_list_like(id_vars): - id_vars = [id_vars] - elif (isinstance(frame.columns, MultiIndex) and - not isinstance(id_vars, list)): - raise ValueError('id_vars must be a list of tuples when columns' - ' are a MultiIndex') - else: - id_vars = list(id_vars) - else: - id_vars = [] - - if value_vars is not None: - if not is_list_like(value_vars): - value_vars = [value_vars] - elif (isinstance(frame.columns, MultiIndex) and - not isinstance(value_vars, list)): - raise ValueError('value_vars must be a list of tuples when' - ' columns are a MultiIndex') - else: - value_vars = list(value_vars) - frame = frame.loc[:, id_vars + value_vars] - else: - frame = frame.copy() - - if col_level is not None: # allow list or other? - # frame is a copy - frame.columns = frame.columns.get_level_values(col_level) - - if var_name is None: - if isinstance(frame.columns, MultiIndex): - if len(frame.columns.names) == len(set(frame.columns.names)): - var_name = frame.columns.names - else: - var_name = ['variable_{i}'.format(i=i) - for i in range(len(frame.columns.names))] - else: - var_name = [frame.columns.name if frame.columns.name is not None - else 'variable'] - if isinstance(var_name, compat.string_types): - var_name = [var_name] - - N, K = frame.shape - K -= len(id_vars) - - mdata = {} - for col in id_vars: - mdata[col] = np.tile(frame.pop(col).values, K) - - mcolumns = id_vars + var_name + [value_name] - - mdata[value_name] = frame.values.ravel('F') - for i, col in enumerate(var_name): - # asanyarray will keep the columns as an Index - mdata[col] = np.asanyarray(frame.columns - ._get_level_values(i)).repeat(N) - - return DataFrame(mdata, columns=mcolumns) - - -def lreshape(data, groups, dropna=True, label=None): - """ - Reshape long-format data to wide. Generalized inverse of DataFrame.pivot - - Parameters - ---------- - data : DataFrame - groups : dict - {new_name : list_of_columns} - dropna : boolean, default True - - Examples - -------- - >>> import pandas as pd - >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526], - ... 'team': ['Red Sox', 'Yankees'], - ... 'year1': [2007, 2007], 'year2': [2008, 2008]}) - >>> data - hr1 hr2 team year1 year2 - 0 514 545 Red Sox 2007 2008 - 1 573 526 Yankees 2007 2008 - - >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}) - team year hr - 0 Red Sox 2007 514 - 1 Yankees 2007 573 - 2 Red Sox 2008 545 - 3 Yankees 2008 526 - - Returns - ------- - reshaped : DataFrame - """ - if isinstance(groups, dict): - keys = list(groups.keys()) - values = list(groups.values()) - else: - keys, values = zip(*groups) - - all_cols = list(set.union(*[set(x) for x in values])) - id_cols = list(data.columns.difference(all_cols)) - - K = len(values[0]) - - for seq in values: - if len(seq) != K: - raise ValueError('All column lists must be same length') - - mdata = {} - pivot_cols = [] - - for target, names in zip(keys, values): - to_concat = [data[col].values for col in names] - mdata[target] = _concat._concat_compat(to_concat) - pivot_cols.append(target) - - for col in id_cols: - mdata[col] = np.tile(data[col].values, K) - - if dropna: - mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool) - for c in pivot_cols: - mask &= notna(mdata[c]) - if not mask.all(): - mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata)) - - return DataFrame(mdata, columns=id_cols + pivot_cols) - - -def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'): - r""" - Wide panel to long format. Less flexible but more user-friendly than melt. - - With stubnames ['A', 'B'], this function expects to find one or more - group of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,... - You specify what you want to call this suffix in the resulting long format - with `j` (for example `j='year'`) - - Each row of these wide variables are assumed to be uniquely identified by - `i` (can be a single column name or a list of column names) - - All remaining variables in the data frame are left intact. - - Parameters - ---------- - df : DataFrame - The wide-format DataFrame - stubnames : str or list-like - The stub name(s). The wide format variables are assumed to - start with the stub names. - i : str or list-like - Column(s) to use as id variable(s) - j : str - The name of the subobservation variable. What you wish to name your - suffix in the long format. - sep : str, default "" - A character indicating the separation of the variable names - in the wide format, to be stripped from the names in the long format. - For example, if your column names are A-suffix1, A-suffix2, you - can strip the hypen by specifying `sep='-'` - - .. versionadded:: 0.20.0 - - suffix : str, default '\\d+' - A regular expression capturing the wanted suffixes. '\\d+' captures - numeric suffixes. Suffixes with no numbers could be specified with the - negated character class '\\D+'. You can also further disambiguate - suffixes, for example, if your wide variables are of the form - Aone, Btwo,.., and you have an unrelated column Arating, you can - ignore the last one by specifying `suffix='(!?one|two)'` - - .. versionadded:: 0.20.0 - - Returns - ------- - DataFrame - A DataFrame that contains each stub name as a variable, with new index - (i, j) - - Examples - -------- - >>> import pandas as pd - >>> import numpy as np - >>> np.random.seed(123) - >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, - ... "A1980" : {0 : "d", 1 : "e", 2 : "f"}, - ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, - ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, - ... "X" : dict(zip(range(3), np.random.randn(3))) - ... }) - >>> df["id"] = df.index - >>> df - A1970 A1980 B1970 B1980 X id - 0 a d 2.5 3.2 -1.085631 0 - 1 b e 1.2 1.3 0.997345 1 - 2 c f 0.7 0.1 0.282978 2 - >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") - ... # doctest: +NORMALIZE_WHITESPACE - X A B - id year - 0 1970 -1.085631 a 2.5 - 1 1970 0.997345 b 1.2 - 2 1970 0.282978 c 0.7 - 0 1980 -1.085631 d 3.2 - 1 1980 0.997345 e 1.3 - 2 1980 0.282978 f 0.1 - - With multuple id columns - - >>> df = pd.DataFrame({ - ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], - ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], - ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], - ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] - ... }) - >>> df - birth famid ht1 ht2 - 0 1 1 2.8 3.4 - 1 2 1 2.9 3.8 - 2 3 1 2.2 2.9 - 3 1 2 2.0 3.2 - 4 2 2 1.8 2.8 - 5 3 2 1.9 2.4 - 6 1 3 2.2 3.3 - 7 2 3 2.3 3.4 - 8 3 3 2.1 2.9 - >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') - >>> l - ... # doctest: +NORMALIZE_WHITESPACE - ht - famid birth age - 1 1 1 2.8 - 2 3.4 - 2 1 2.9 - 2 3.8 - 3 1 2.2 - 2 2.9 - 2 1 1 2.0 - 2 3.2 - 2 1 1.8 - 2 2.8 - 3 1 1.9 - 2 2.4 - 3 1 1 2.2 - 2 3.3 - 2 1 2.3 - 2 3.4 - 3 1 2.1 - 2 2.9 - - Going from long back to wide just takes some creative use of `unstack` - - >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack() - >>> w.columns = pd.Index(w.columns).str.join('') - >>> w.reset_index() - famid birth ht1 ht2 - 0 1 1 2.8 3.4 - 1 1 2 2.9 3.8 - 2 1 3 2.2 2.9 - 3 2 1 2.0 3.2 - 4 2 2 1.8 2.8 - 5 2 3 1.9 2.4 - 6 3 1 2.2 3.3 - 7 3 2 2.3 3.4 - 8 3 3 2.1 2.9 - - Less wieldy column names are also handled - - >>> np.random.seed(0) - >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3), - ... 'A(quarterly)-2011': np.random.rand(3), - ... 'B(quarterly)-2010': np.random.rand(3), - ... 'B(quarterly)-2011': np.random.rand(3), - ... 'X' : np.random.randint(3, size=3)}) - >>> df['id'] = df.index - >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS - A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 ... - 0 0.548814 0.544883 0.437587 ... - 1 0.715189 0.423655 0.891773 ... - 2 0.602763 0.645894 0.963663 ... - X id - 0 0 0 - 1 1 1 - 2 1 2 - - >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id', - ... j='year', sep='-') - ... # doctest: +NORMALIZE_WHITESPACE - X A(quarterly) B(quarterly) - id year - 0 2010 0 0.548814 0.437587 - 1 2010 1 0.715189 0.891773 - 2 2010 1 0.602763 0.963663 - 0 2011 0 0.544883 0.383442 - 1 2011 1 0.423655 0.791725 - 2 2011 1 0.645894 0.528895 - - If we have many columns, we could also use a regex to find our - stubnames and pass that list on to wide_to_long - - >>> stubnames = sorted( - ... set([match[0] for match in df.columns.str.findall( - ... r'[A-B]\(.*\)').values if match != [] ]) - ... ) - >>> list(stubnames) - ['A(quarterly)', 'B(quarterly)'] - - Notes - ----- - All extra variables are left untouched. This simply uses - `pandas.melt` under the hood, but is hard-coded to "do the right thing" - in a typicaly case. - """ - def get_var_names(df, stub, sep, suffix): - regex = "^{stub}{sep}{suffix}".format( - stub=re.escape(stub), sep=re.escape(sep), suffix=suffix) - return df.filter(regex=regex).columns.tolist() - - def melt_stub(df, stub, i, j, value_vars, sep): - newdf = melt(df, id_vars=i, value_vars=value_vars, - value_name=stub.rstrip(sep), var_name=j) - newdf[j] = Categorical(newdf[j]) - newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") - - return newdf.set_index(i + [j]) - - if any(map(lambda s: s in df.columns.tolist(), stubnames)): - raise ValueError("stubname can't be identical to a column name") - - if not is_list_like(stubnames): - stubnames = [stubnames] - else: - stubnames = list(stubnames) - - if not is_list_like(i): - i = [i] - else: - i = list(i) - - if df[i].duplicated().any(): - raise ValueError("the id variables need to uniquely identify each row") - - value_vars = list(map(lambda stub: - get_var_names(df, stub, sep, suffix), stubnames)) - - value_vars_flattened = [e for sublist in value_vars for e in sublist] - id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) - - melted = [] - for s, v in zip(stubnames, value_vars): - melted.append(melt_stub(df, s, i, j, v, sep)) - melted = melted[0].join(melted[1:], how='outer') - - if len(i) == 1: - new = df[id_vars].set_index(i).join(melted) - return new - - new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) - - return new - - def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False): """ diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index fc9f89934b4ea..2722c3e92d85a 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -11,8 +11,8 @@ from pandas.util.testing import assert_frame_equal -from pandas.core.reshape.reshape import ( - melt, lreshape, get_dummies, wide_to_long) +from pandas.core.reshape.reshape import get_dummies +from pandas.core.reshape.melt import melt, lreshape, wide_to_long import pandas.util.testing as tm from pandas.compat import range, u