From ade3841fb489de47fb10bd7182235b0978ec8777 Mon Sep 17 00:00:00 2001
From: Davies Liu
Date: Wed, 20 May 2015 22:07:10 -0700
Subject: [PATCH 1/3] add DataFrame.rollup/cube in Python

---
 python/pyspark/sql/dataframe.py | 46 +++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index f2280b5100e5..d9f0196044ab 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -755,6 +755,52 @@ def groupBy(self, *cols):
         from pyspark.sql.group import GroupedData
         return GroupedData(jdf, self.sql_ctx)
 
+    def rollup(self, *cols):
+        """
+        Create a multi-dimensional rollup for the current :class:`DataFrame` using
+        the specified columns, so we can run aggregation on them.
+
+        >>> df.rollup('name', df.age).count().show()
+        +-----+----+-----+
+        | name| age|count|
+        +-----+----+-----+
+        |Alice|null|    1|
+        |  Bob|   5|    1|
+        |  Bob|null|    1|
+        | null|null|    2|
+        |Alice|   2|    1|
+        +-----+----+-----+
+
+        .. versionadded:: 1.4
+        """
+        jdf = self._jdf.rollup(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
+        return GroupedData(jdf, self.sql_ctx)
+
+    def cube(self, *cols):
+        """
+        Create a multi-dimensional cube for the current :class:`DataFrame` using
+        the specified columns, so we can run aggregation on them.
+
+        >>> df.cube('name', df.age).count().show()
+        +-----+----+-----+
+        | name| age|count|
+        +-----+----+-----+
+        | null|   2|    1|
+        |Alice|null|    1|
+        |  Bob|   5|    1|
+        |  Bob|null|    1|
+        | null|   5|    1|
+        | null|null|    2|
+        |Alice|   2|    1|
+        +-----+----+-----+
+
+        .. versionadded:: 1.4
+        """
+        jdf = self._jdf.cube(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
+        return GroupedData(jdf, self.sql_ctx)
+
     def agg(self, *exprs):
         """ Aggregate on the entire :class:`DataFrame` without groups
         (shorthand for ``df.groupBy.agg()``).

From 8ad5af405e79403ab5eb09f2addb19744d00ab07 Mon Sep 17 00:00:00 2001
From: Davies Liu
Date: Wed, 20 May 2015 22:40:08 -0700
Subject: [PATCH 2/3] Update dataframe.py

---
 python/pyspark/sql/dataframe.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index d9f0196044ab..f87761b6933f 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -751,55 +751,55 @@ def groupBy(self, *cols):
         >>> df.groupBy(['name', df.age]).count().collect()
         [Row(name=u'Bob', age=5, count=1), Row(name=u'Alice', age=2, count=1)]
         """
-        jdf = self._jdf.groupBy(self._jcols(*cols))
+        jgd = self._jdf.groupBy(self._jcols(*cols))
         from pyspark.sql.group import GroupedData
-        return GroupedData(jdf, self.sql_ctx)
+        return GroupedData(jgd, self.sql_ctx)
 
     def rollup(self, *cols):
         """
         Create a multi-dimensional rollup for the current :class:`DataFrame` using
         the specified columns, so we can run aggregation on them.
 
         >>> df.rollup('name', df.age).count().show()
         +-----+----+-----+
         | name| age|count|
         +-----+----+-----+
         |Alice|null|    1|
         |  Bob|   5|    1|
         |  Bob|null|    1|
         | null|null|    2|
         |Alice|   2|    1|
         +-----+----+-----+
 
         .. versionadded:: 1.4
         """
-        jdf = self._jdf.rollup(self._jcols(*cols))
+        jgd = self._jdf.rollup(self._jcols(*cols))
         from pyspark.sql.group import GroupedData
-        return GroupedData(jdf, self.sql_ctx)
+        return GroupedData(jgd, self.sql_ctx)
 
     def cube(self, *cols):
         """
         Create a multi-dimensional cube for the current :class:`DataFrame` using
         the specified columns, so we can run aggregation on them.
 
         >>> df.cube('name', df.age).count().show()
         +-----+----+-----+
         | name| age|count|
         +-----+----+-----+
         | null|   2|    1|
         |Alice|null|    1|
         |  Bob|   5|    1|
         |  Bob|null|    1|
         | null|   5|    1|
         | null|null|    2|
         |Alice|   2|    1|
         +-----+----+-----+
 
         .. versionadded:: 1.4
         """
-        jdf = self._jdf.cube(self._jcols(*cols))
+        jgd = self._jdf.cube(self._jcols(*cols))
         from pyspark.sql.group import GroupedData
-        return GroupedData(jdf, self.sql_ctx)
+        return GroupedData(jgd, self.sql_ctx)
 
     def agg(self, *exprs):
         """ Aggregate on the entire :class:`DataFrame` without groups

From 0261db1f1dd869a033766aef231bbe1e6492c32b Mon Sep 17 00:00:00 2001
From: Davies Liu
Date: Wed, 20 May 2015 23:51:48 -0700
Subject: [PATCH 3/3] use @since

---
 python/pyspark/sql/dataframe.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 76804b530456..132db90e69f5 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -805,47 +805,45 @@ def groupBy(self, *cols):
         from pyspark.sql.group import GroupedData
         return GroupedData(jgd, self.sql_ctx)
 
+    @since(1.4)
     def rollup(self, *cols):
         """
         Create a multi-dimensional rollup for the current :class:`DataFrame` using
         the specified columns, so we can run aggregation on them.
 
         >>> df.rollup('name', df.age).count().show()
         +-----+----+-----+
         | name| age|count|
         +-----+----+-----+
         |Alice|null|    1|
         |  Bob|   5|    1|
         |  Bob|null|    1|
         | null|null|    2|
         |Alice|   2|    1|
         +-----+----+-----+
-
-        .. versionadded:: 1.4
         """
         jgd = self._jdf.rollup(self._jcols(*cols))
         from pyspark.sql.group import GroupedData
         return GroupedData(jgd, self.sql_ctx)
 
+    @since(1.4)
     def cube(self, *cols):
         """
         Create a multi-dimensional cube for the current :class:`DataFrame` using
         the specified columns, so we can run aggregation on them.
 
         >>> df.cube('name', df.age).count().show()
         +-----+----+-----+
         | name| age|count|
         +-----+----+-----+
         | null|   2|    1|
         |Alice|null|    1|
         |  Bob|   5|    1|
         |  Bob|null|    1|
         | null|   5|    1|
         | null|null|    2|
         |Alice|   2|    1|
         +-----+----+-----+
-
-        .. versionadded:: 1.4
         """
         jgd = self._jdf.cube(self._jcols(*cols))
         from pyspark.sql.group import GroupedData
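
Patch 3 replaces the literal `.. versionadded::` directives with a `@since` decorator. PySpark ships its own `since` helper; the sketch below is an illustrative reimplementation of the idea (append a Sphinx versionadded directive to the wrapped method's docstring), not the exact library source:

    import re

    def since(version):
        """Return a decorator that records the version a function was added
        in by appending ``.. versionadded::`` to its docstring."""
        indent_re = re.compile(r'\n( +)')

        def decorator(f):
            doc = f.__doc__ or ""
            # Match the docstring's existing indentation so the directive
            # renders correctly under Sphinx.
            indents = indent_re.findall(doc)
            indent = ' ' * (min(len(i) for i in indents) if indents else 0)
            f.__doc__ = doc.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
            return f
        return decorator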
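
For reference, a minimal usage sketch of the new API once the series is applied (assuming a Spark 1.4 deployment; the `sc`/`sqlContext` setup, the app name, and the two-row dataset are illustrative, mirroring the doctest fixture `df` rather than coming from the patches):

    from pyspark import SparkContext
    from pyspark.sql import SQLContext, Row

    sc = SparkContext(appName="rollup-cube-demo")  # hypothetical app name
    sqlContext = SQLContext(sc)

    # Same shape as the doctest fixture: two rows, two grouping columns.
    df = sqlContext.createDataFrame([Row(name='Alice', age=2),
                                     Row(name='Bob', age=5)])

    # rollup() yields hierarchical subtotals: (name, age), (name,), and the
    # grand total (null, null) -- the five rows in the first doctest table.
    df.rollup('name', df.age).count().show()

    # cube() additionally yields the age-only subtotals (null, 2) and
    # (null, 5), giving the seven rows in the second doctest table.
    df.cube('name', df.age).count().show()

    sc.stop()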