Skip to content

feat: recover struct column from exploded Series #904

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2be896e
feat: recover struct columns from exploded Series
Aug 19, 2024
8ed963b
Merge branch 'googleapis:main' into structs
mattyopl Aug 19, 2024
69e6273
feat: recover struct columns from exploded Series
Aug 19, 2024
bf27fda
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 19, 2024
954f2e6
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 19, 2024
e7b1985
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
49e1d35
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
f873f10
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
25d811c
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
15af959
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
5254f92
Merge branch 'main' into structs
mattyopl Aug 29, 2024
786405a
Merge branch 'main' into structs
mattyopl Aug 29, 2024
d1d2569
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
1d9f96d
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
bd8b423
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
f063f0c
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
3a68856
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
ca9d03e
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 29, 2024
4549e78
Merge branch 'structs' of github.com:mattyopl/python-bigquery-datafra…
Aug 30, 2024
a802709
Merge branch 'main' into structs
mattyopl Aug 30, 2024
27b8130
Merge branch 'main' into structs
mattyopl Aug 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,40 @@ def json_extract_array(
return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path))


def struct(value: dataframe.DataFrame) -> series.Series:
    """Pack the columns of a DataFrame into a single Series of structs.

    Every row of the DataFrame becomes one struct entry, and every DataFrame
    column becomes a field of that struct, named after the column label.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import bigframes.series as series
        >>> bpd.options.display.progress_bar = None

        >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},])
        >>> df = srs.struct.explode()
        >>> bbq.struct(df)
        0 {'project': 'pandas', 'version': 1}
        1 {'project': 'numpy', 'version': 2}
        dtype: struct<project: string, version: int64>[pyarrow]

    Args:
        value (bigframes.dataframe.DataFrame):
            The DataFrame whose columns are combined into struct fields.

    Returns:
        bigframes.series.Series: A new Series holding one struct per row of
        the original DataFrame.
    """
    source_block = value._block
    input_columns = source_block.value_columns
    # The field names of the resulting struct are the DataFrame column labels.
    pack_op = ops.StructOp(column_names=tuple(source_block.column_labels))
    packed_block, struct_column_id = source_block.apply_nary_op(
        input_columns, pack_op
    )
    # Keep only the freshly computed struct column as the Series values.
    return bigframes.series.Series(packed_block.select_column(struct_column_id))


# Search functions defined from
# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions

Expand Down
11 changes: 11 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1539,6 +1539,17 @@ def nary_remote_function_op_impl(
return result


@scalar_op_compiler.register_nary_op(ops.StructOp, pass_op=True)
def struct_op_impl(
    *values: ibis_types.Value, op: ops.StructOp
) -> ibis_types.StructValue:
    """Compile a ``StructOp`` into an ibis struct expression.

    Args:
        *values: The input expressions, one per struct field, in field order.
        op: The op being compiled; ``op.column_names[i]`` names the field
            built from ``values[i]``.

    Returns:
        The result of ``ibis.struct`` applied to the name -> value mapping.
    """
    # Index into column_names by position so that the i-th input is paired
    # with the i-th field name.
    named_fields = {
        op.column_names[position]: field_value
        for position, field_value in enumerate(values)
    }
    return ibis.struct(named_fields)


# Helpers
def is_null(value) -> bool:
# float NaN/inf should be treated as distinct from 'true' null values
Expand Down
23 changes: 23 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,29 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
case_when_op = CaseWhenOp()


@dataclasses.dataclass(frozen=True)
class StructOp(NaryOp):
    """N-ary op that combines its inputs into one struct value.

    The i-th input becomes the struct field named ``column_names[i]``.
    """

    name: typing.ClassVar[str] = "struct"
    # One field name per input, in input order. The op is variadic, so this
    # is a tuple of arbitrary length (``tuple[str]`` would mean a 1-tuple).
    column_names: tuple[str, ...]

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        """Return the Arrow-backed struct dtype produced by this op.

        Args:
            *input_types: The dtype of each input, in the same order as
                ``column_names``.

        Returns:
            A ``pd.ArrowDtype`` wrapping ``pa.struct`` with one
            ``(name, arrow_type)`` field per input, where each input dtype is
            converted via ``dtypes.bigframes_dtype_to_arrow_dtype``.
        """
        # Internal invariant: exactly one name per input. Checked up front
        # because zip() below would otherwise silently truncate.
        assert len(input_types) == len(self.column_names), (
            f"StructOp expected {len(self.column_names)} inputs, "
            f"got {len(input_types)}"
        )
        # [(name1, type1), (name2, type2), ...]
        fields = [
            (column_name, dtypes.bigframes_dtype_to_arrow_dtype(input_type))
            for column_name, input_type in zip(self.column_names, input_types)
        ]
        return pd.ArrowDtype(pa.struct(fields))


# Just parameterless unary ops for now
# TODO: Parameter mappings
NUMPY_TO_OP: typing.Final = {
Expand Down
61 changes: 61 additions & 0 deletions tests/system/small/bigquery/test_struct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import pytest

import bigframes.bigquery as bbq
import bigframes.series as series


@pytest.mark.parametrize(
    "columns_arg",
    [
        pytest.param(
            [
                {"version": 1, "project": "pandas"},
                {"version": 2, "project": "pandas"},
                {"version": 1, "project": "numpy"},
            ],
            id="int_and_string_fields",
        ),
        pytest.param(
            [
                {"version": 1, "project": "pandas"},
                {"version": None, "project": "pandas"},
                {"version": 1, "project": "numpy"},
            ],
            id="null_field_value",
        ),
        # This case was previously listed twice verbatim; one copy is enough.
        pytest.param(
            [
                {"array": [6, 4, 6], "project": "pandas"},
                {"array": [6, 4, 7, 6], "project": "pandas"},
                {"array": [7, 2, 3], "project": "numpy"},
            ],
            id="array_field",
        ),
        pytest.param(
            [
                {"struct": [{"x": 2, "y": 4}], "project": "pandas"},
                {"struct": [{"x": 9, "y": 3}], "project": "pandas"},
                {"struct": [{"x": 1, "y": 2}], "project": "numpy"},
            ],
            id="array_of_structs_field",
        ),
    ],
)
def test_struct_from_dataframe(columns_arg):
    """Exploding a struct Series and re-packing it with bbq.struct round-trips."""
    srs = series.Series(
        columns_arg,
    )
    expected = srs.to_pandas()
    actual = bbq.struct(srs.struct.explode()).to_pandas()

    pd.testing.assert_series_equal(
        expected,
        actual,
        check_index_type=False,
        check_dtype=False,
    )