-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-24561][SQL][Python] User-defined window aggregation functions with Pandas UDF (bounded window) #22305
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
431194d
1fbc2a2
2240032
8916284
db6f77a
9ae976c
d31a133
9b731f5
f982bf2
893cbc2
9de9513
d2be73b
12d3ae1
8c68727
6bfedd9
cc4c647
3ec05ab
05fac37
64db3b0
4df657a
903cbed
96b44df
2d24d5e
5d3bbd6
c2d574f
0408c26
04873bd
03702d4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -145,7 +145,18 @@ def wrapped(*series): | |
| return lambda *a: (wrapped(*a), arrow_return_type) | ||
|
|
||
|
|
||
| def wrap_window_agg_pandas_udf(f, return_type): | ||
| def wrap_window_agg_pandas_udf(f, return_type, runner_conf, udf_index): | ||
| window_bound_types_str = runner_conf.get('pandas_window_bound_types') | ||
| window_bound_type = [t.strip().lower() for t in window_bound_types_str.split(',')][udf_index] | ||
| if window_bound_type == 'bounded': | ||
| return wrap_bounded_window_agg_pandas_udf(f, return_type) | ||
| elif window_bound_type == 'unbounded': | ||
| return wrap_unbounded_window_agg_pandas_udf(f, return_type) | ||
| else: | ||
| raise RuntimeError("Invalid window bound type: {} ".format(window_bound_type)) | ||
|
|
||
|
|
||
| def wrap_unbounded_window_agg_pandas_udf(f, return_type): | ||
| # This is similar to grouped_agg_pandas_udf, the only difference | ||
| # is that window_agg_pandas_udf needs to repeat the return value | ||
| # to match window length, where grouped_agg_pandas_udf just returns | ||
|
|
@@ -160,7 +171,41 @@ def wrapped(*series): | |
| return lambda *a: (wrapped(*a), arrow_return_type) | ||
|
|
||
|
|
||
| def read_single_udf(pickleSer, infile, eval_type, runner_conf): | ||
| def wrap_bounded_window_agg_pandas_udf(f, return_type): | ||
| arrow_return_type = to_arrow_type(return_type) | ||
|
|
||
| def wrapped(begin_index, end_index, *series): | ||
| import pandas as pd | ||
| result = [] | ||
|
|
||
| # Index operations are faster on np.ndarray, | ||
| # so we turn the index series into np arrays | ||
| # here for performance. | ||
| begin_array = begin_index.values | ||
| end_array = end_index.values | ||
|
|
||
| for i in range(len(begin_array)): | ||
| # Note: Creating a slice from a series for each window is | ||
| # actually pretty expensive. However, there | ||
| # is no easy way to reduce the cost here. | ||
| # Note: s.iloc[i : j] is about 30% faster than s[i: j], with | ||
| # the caveat that the created slices share the same | ||
| # memory with s. Therefore, users are not allowed to | ||
| # change the values of the input series inside the window | ||
| # function. It is rare that a user needs to modify the | ||
| # input series in the window function, and therefore | ||
| # it is a reasonable restriction. | ||
| # Note: Calling reset_index on the slices will increase the cost | ||
| # of creating slices by about 100%. Therefore, for performance | ||
| # reasons we don't do it here. | ||
| series_slices = [s.iloc[begin_array[i]: end_array[i]] for s in series] | ||
| result.append(f(*series_slices)) | ||
| return pd.Series(result) | ||
|
|
||
| return lambda *a: (wrapped(*a), arrow_return_type) | ||
|
|
||
|
|
||
| def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index): | ||
| num_arg = read_int(infile) | ||
| arg_offsets = [read_int(infile) for i in range(num_arg)] | ||
| row_func = None | ||
|
|
@@ -184,7 +229,7 @@ def read_single_udf(pickleSer, infile, eval_type, runner_conf): | |
| elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF: | ||
| return arg_offsets, wrap_grouped_agg_pandas_udf(func, return_type) | ||
| elif eval_type == PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF: | ||
| return arg_offsets, wrap_window_agg_pandas_udf(func, return_type) | ||
| return arg_offsets, wrap_window_agg_pandas_udf(func, return_type, runner_conf, udf_index) | ||
| elif eval_type == PythonEvalType.SQL_BATCHED_UDF: | ||
| return arg_offsets, wrap_udf(func, return_type) | ||
| else: | ||
|
|
@@ -226,7 +271,8 @@ def read_udfs(pickleSer, infile, eval_type): | |
|
|
||
| # See FlatMapGroupsInPandasExec for how arg_offsets are used to | ||
| # distinguish between grouping attributes and data attributes | ||
| arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf) | ||
| arg_offsets, udf = read_single_udf( | ||
| pickleSer, infile, eval_type, runner_conf, udf_index=0) | ||
| udfs['f'] = udf | ||
| split_offset = arg_offsets[0] + 1 | ||
| arg0 = ["a[%d]" % o for o in arg_offsets[1: split_offset]] | ||
|
|
@@ -238,7 +284,8 @@ def read_udfs(pickleSer, infile, eval_type): | |
| # In the special case of a single UDF this will return a single result rather | ||
| # than a tuple of results; this is the format that the JVM side expects. | ||
| for i in range(num_udfs): | ||
| arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf) | ||
| arg_offsets, udf = read_single_udf( | ||
| pickleSer, infile, eval_type, runner_conf, udf_index=i) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So right now, since only window udfs can be sent together, the mapping of
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If I understand you correctly, do you mean that we maintain a mutable list of the remaining
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, basically this. I'm not crazy about changing the conf in place, but it wouldn't rely on any particular udf indexing then. Maybe it would make more sense to check the eval type before calling
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This isn't a big deal though, so don't block merging if the rest is ready. It can be improved upon later if needed.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @BryanCutler I don't know which way is better.. if we pass Yeah I agree we can revisit this later if needed. |
||
| udfs['f%d' % i] = udf | ||
| args = ["a[%d]" % o for o in arg_offsets] | ||
| call_udf.append("f%d(%s)" % (i, ", ".join(args))) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This import and all the others should be moved to the top; it's repeated many times. It could be done as a follow-up, though.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SGTM. Opened https://jira.apache.org/jira/browse/SPARK-26364