diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index bec7b5ff0e..1e8e8d578d 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -271,6 +271,40 @@ def json_extract_array( return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) +def struct(value: dataframe.DataFrame) -> series.Series: + """Takes a DataFrame and converts it into a Series of structs with each + struct entry corresponding to a DataFrame row and each struct field + corresponding to a DataFrame column + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import bigframes.series as series + >>> bpd.options.display.progress_bar = None + + >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) + >>> df = srs.struct.explode() + >>> bbq.struct(df) + 0 {'project': 'pandas', 'version': 1} + 1 {'project': 'numpy', 'version': 2} + dtype: struct[pyarrow] + + Args: + value (bigframes.dataframe.DataFrame): + The DataFrame to be converted to a Series of structs + + Returns: + bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame + """ + block = value._block + block, result_id = block.apply_nary_op( + block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) + ) + block = block.select_column(result_id) + return bigframes.series.Series(block) + + # Search functions defined from # https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 86501214ad..9e18b391d6 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1539,6 +1539,17 @@ def nary_remote_function_op_impl( return result +@scalar_op_compiler.register_nary_op(ops.StructOp, pass_op=True) +def struct_op_impl( + *values: ibis_types.Value, op: ops.StructOp +) -> ibis_types.StructValue: + data = {} + for i, value in enumerate(values): + data[op.column_names[i]] = value + + return ibis.struct(data) + + # Helpers def is_null(value) -> bool: # float NaN/inf should be treated as distinct from 'true' null values diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index cd9e70819e..51a962b13b 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -867,6 +867,29 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT case_when_op = CaseWhenOp() +@dataclasses.dataclass(frozen=True) +class StructOp(NaryOp): + name: typing.ClassVar[str] = "struct" + column_names: tuple[str] + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + num_input_types = len(input_types) + # value1, value2, ... + assert num_input_types == len(self.column_names) + fields = [] + + for i in range(num_input_types): + fields.append( + ( + self.column_names[i], + dtypes.bigframes_dtype_to_arrow_dtype(input_types[i]), + ) + ) + return pd.ArrowDtype( + pa.struct(fields) + ) # [(name1, value1), (name2, value2), ...] + + # Just parameterless unary ops for now # TODO: Parameter mappings NUMPY_TO_OP: typing.Final = { diff --git a/tests/system/small/bigquery/test_struct.py b/tests/system/small/bigquery/test_struct.py new file mode 100644 index 0000000000..58c822f642 --- /dev/null +++ b/tests/system/small/bigquery/test_struct.py @@ -0,0 +1,61 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +import bigframes.bigquery as bbq +import bigframes.series as series + + +@pytest.mark.parametrize( + "columns_arg", + [ + [ + {"version": 1, "project": "pandas"}, + {"version": 2, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ], + [ + {"version": 1, "project": "pandas"}, + {"version": None, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ], + [ + {"array": [6, 4, 6], "project": "pandas"}, + {"array": [6, 4, 7, 6], "project": "pandas"}, + {"array": [7, 2, 3], "project": "numpy"}, + ], + [ + {"array": [6, 4, 6], "project": "pandas"}, + {"array": [6, 4, 7, 6], "project": "pandas"}, + {"array": [7, 2, 3], "project": "numpy"}, + ], + [ + {"struct": [{"x": 2, "y": 4}], "project": "pandas"}, + {"struct": [{"x": 9, "y": 3}], "project": "pandas"}, + {"struct": [{"x": 1, "y": 2}], "project": "numpy"}, + ], + ], +) +def test_struct_from_dataframe(columns_arg): + srs = series.Series( + columns_arg, + ) + pd.testing.assert_series_equal( + srs.to_pandas(), + bbq.struct(srs.struct.explode()).to_pandas(), + check_index_type=False, + check_dtype=False, + )