added test and fix for chained pandas_udf

BryanCutler · BryanCutler · commit b8ffa50132d0 · 2017-09-21T14:29:38.000-07:00
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
@@ -3300,6 +3300,24 @@ def test_vectorized_udf_mix_udf(self):
                     'Can not mix vectorized and non-vectorized UDFs'):
                 df.select(row_by_row_udf(col('id')), pd_udf(col('id'))).collect()
 
+    def test_vectorized_udf_chained(self):
+        from pyspark.sql.functions import pandas_udf, col
+        df = self.spark.range(10).toDF('x')
+        f = pandas_udf(lambda x: x + 1, LongType())
+        g = pandas_udf(lambda x: x - 1, LongType())
+        res = df.select(g(f(col('x'))))  # g undoes f, so the chained result must equal df
+        self.assertEqual(df.collect(), res.collect())
+
+    def test_vectorized_udf_wrong_return_type(self):
+        from pyspark.sql.functions import pandas_udf, col
+        df = self.spark.range(10).toDF('x')
+        f = pandas_udf(lambda x: x * 1.0, StringType())  # float result, declared as StringType
+        with QuietTest(self.sc):
+            with self.assertRaisesRegexp(
+                    Exception,
+                    'Invalid.*type.*string'):
+                df.select(f(col('x'))).collect()  # must raise an error matching the regex above
+
 
 if __name__ == "__main__":
     from pyspark.sql.tests import *
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
@@ -60,9 +60,12 @@ def read_command(serializer, file):
     return command
 
 
-def chain(f, g):
-    """chain two function together """
-    return lambda *a: g(f(*a))
+def chain(f, g, eval_type):
+    """Chain two functions together: the returned callable applies f, then g to f's result."""
+    if eval_type == PythonEvalType.SQL_PANDAS_UDF:
+        return lambda *a, **kwargs: g(f(*a, **kwargs), **kwargs)  # same kwargs go to f and g
+    else:
+        return lambda *a: g(f(*a))
 
 
 def wrap_udf(f, return_type):
@@ -96,7 +99,7 @@ def read_single_udf(pickleSer, infile, eval_type):
         if row_func is None:
             row_func = f
         else:
-            row_func = chain(row_func, f)
+            row_func = chain(row_func, f, eval_type)
     # the last returnType will be the return type of UDF
     if eval_type == PythonEvalType.SQL_PANDAS_UDF:
         # A pandas_udf will take kwargs as the last argument