From 992dc210d8a4ed3d1bf65e76297f0afe9c60bd89 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 7 Oct 2025 04:17:32 +0000 Subject: [PATCH 01/37] change to ai.generate --- bigframes/operations/blob.py | 12 +++++++----- tests/system/large/blob/test_function.py | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 1f6b75a8f5..038cc1d891 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -974,21 +974,23 @@ def audio_transcribe( prompt_text = "**Task:** Transcribe the provided audio. **Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio." - # Convert the audio series to the runtime representation required by the model. - audio_runtime = audio_series.blob._get_runtime("R", with_metadata=True) - + # Use bbq.ai.generate() to transcribe audio transcribed_results = bbq.ai.generate( - prompt=(prompt_text, audio_runtime), + prompt=(prompt_text, audio_series), connection_id=connection, endpoint=model_name, - model_params={"generationConfig": {"temperature": 0.0}}, + request_type="unspecified", ) + transcribed_content_series = transcribed_results.struct.field("result").rename( + "transcribed_content" + ) transcribed_content_series = transcribed_results.struct.field("result").rename( "transcribed_content" ) if verbose: + transcribed_status_series = transcribed_results.struct.field("status") transcribed_status_series = transcribed_results.struct.field("status") results_df = bpd.DataFrame( { diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 7963fabd0b..2124234649 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -768,6 +768,7 @@ def test_blob_transcribe( ) .to_pandas() ) + print(actual) # check relative length expected_text = "Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress" From 74e042a9ae286368a94840dbaee1d33dcafe673a Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Sat, 4 Oct 2025 07:19:28 +0000 Subject: [PATCH 02/37] perf: Default to interactive display for SQL in anywidget mode Previously, SQL queries in anywidget mode would fall back to deferred execution, showing a dry run instead of an interactive table. This change modifies the display logic to directly use the anywidget interactive display for SQL queries, providing a more consistent and responsive user experience. A test case has been added to verify this behavior. --- bigframes/dataframe.py | 21 ++++++++++++++++++--- tests/system/small/test_anywidget.py | 15 +++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f016fddd83..ae284fef0e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,11 +783,26 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # anywdiget mode uses the same display logic as the "deferred" mode - # for faster execution - if opts.repr_mode in ("deferred", "anywidget"): + + # Only deferred mode shows dry run + if opts.repr_mode in ("deferred"): return formatter.repr_query_job(self._compute_dry_run()) + # Anywidget mode uses interative display + if opts.repr_mode == "anywidget": + # Try to display with anywidget, fall back to deferred if not in IPython + try: + from IPython.display import display as ipython_display + + from bigframes import display + + widget = display.TableWidget(self.copy()) + ipython_display(widget) + return "" # Return empty string since we used display() + except (AttributeError, ValueError, ImportError): + # Not in IPython environment, fall back to deferred mode + return formatter.repr_query_job(self._compute_dry_run()) + # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 8944ee5365..ad16888b44 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -455,6 +455,21 @@ def test_widget_creation_should_load_css_for_rendering(table_widget): assert ".bigframes-widget .footer" in css_content +def test_sql_anywidget_mode(session: bf.Session): + """ + Test that a SQL query runs in anywidget mode. + """ + sql = "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_current` LIMIT 5" + + with bf.option_context("display.repr_mode", "anywidget"): + df = session.read_gbq(sql) + # In a real environment, this would display a widget. + # For testing, we just want to make sure we're in the anywidget code path. + # The `_repr_html_` method in anywidget mode will return an empty string + # and display the widget via IPython's display mechanism. + assert df._repr_html_() == "" + + def test_widget_row_count_should_be_immutable_after_creation( paginated_bf_df: bf.dataframe.DataFrame, ): From 074d4c20f172c1ac2f0ed76bee38bb7d02b5acf3 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Sat, 4 Oct 2025 08:44:21 +0000 Subject: [PATCH 03/37] fix: resolve double printing issue in anywidget mode --- bigframes/dataframe.py | 5 +- notebooks/dataframes/anywidget_mode.ipynb | 38 ++++++-- tests/system/small/test_anywidget.py | 105 ++-------------------- 3 files changed, 41 insertions(+), 107 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ae284fef0e..0eb53ddc03 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -792,13 +792,10 @@ def __repr__(self) -> str: if opts.repr_mode == "anywidget": # Try to display with anywidget, fall back to deferred if not in IPython try: - from IPython.display import display as ipython_display - from bigframes import display widget = display.TableWidget(self.copy()) - ipython_display(widget) - return "" # Return empty string since we used display() + return widget._repr_html_() # Return widget's HTML representation except (AttributeError, ValueError, ImportError): # Not in IPython environment, fall back to deferred mode return formatter.repr_query_job(self._compute_dry_run()) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index c2af915721..88f9658009 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -73,11 +73,25 @@ "id": "f289d250", "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "071c0a905297406ba6c990cbbb8fc28d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5552452, table_html=' 0 assert ".bigframes-widget .footer" in css_content -def test_sql_anywidget_mode(session: bf.Session): +@mock.patch("bigframes.display.TableWidget") +def test_sql_anywidget_mode(mock_table_widget, session: bf.Session): """ Test that a SQL query runs in anywidget mode. """ @@ -465,88 +456,8 @@ def test_sql_anywidget_mode(session: bf.Session): df = session.read_gbq(sql) # In a real environment, this would display a widget. # For testing, we just want to make sure we're in the anywidget code path. - # The `_repr_html_` method in anywidget mode will return an empty string - # and display the widget via IPython's display mechanism. - assert df._repr_html_() == "" - - -def test_widget_row_count_should_be_immutable_after_creation( - paginated_bf_df: bf.dataframe.DataFrame, -): - """ - Given a widget created with a specific configuration when global display - options are changed later, the widget's original row_count should remain - unchanged. - """ - from bigframes.display import TableWidget - - # Use a context manager to ensure the option is reset - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - widget = TableWidget(paginated_bf_df) - initial_row_count = widget.row_count - - # Change a global option that could influence row count - bf.options.display.max_rows = 10 - - # Verify the row count remains immutable. - assert widget.row_count == initial_row_count - - -class FaultyIterator: - def __iter__(self): - return self - - def __next__(self): - raise ValueError("Simulated read error") - - -def test_widget_should_fallback_to_zero_rows_with_invalid_total_rows( - paginated_bf_df: bf.dataframe.DataFrame, - monkeypatch: pytest.MonkeyPatch, -): - """ - Given an internal component fails to return valid execution data, - when the TableWidget is created, its error_message should be set and displayed. - """ - # Patch the executor's 'execute' method to simulate an error. - monkeypatch.setattr( - "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute", - lambda self, *args, **kwargs: mock_execute_result_with_params( - self, paginated_bf_df._block.expr.schema, None, [], *args, **kwargs - ), - ) - - # Create the TableWidget under the error condition. - with bf.option_context("display.repr_mode", "anywidget"): - from bigframes.display import TableWidget - - # The widget should handle the faulty data from the mock without crashing. - widget = TableWidget(paginated_bf_df) - - # The widget should have an error message and display it in the HTML. - assert widget.row_count == 0 - assert widget._error_message is not None - assert "Could not determine total row count" in widget._error_message - assert widget._error_message in widget.table_html - - -def test_widget_row_count_reflects_actual_data_available( - paginated_bf_df: bf.dataframe.DataFrame, -): - """ - Test that widget row_count reflects the actual data available, - regardless of theoretical limits. - """ - from bigframes.display import TableWidget - - # Set up display options that define a page size. - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - widget = TableWidget(paginated_bf_df) - - # The widget should report the total rows in the DataFrame, - # not limited by page_size (which only affects pagination) - assert widget.row_count == EXPECTED_ROW_COUNT - assert widget.page_size == 2 # Respects the display option + df._repr_html_() + mock_table_widget.assert_called_once() # TODO(shuowei): Add tests for custom index and multiindex From 982ea9781af00c88b19b84bc16e0de3a78dea5ef Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 7 Oct 2025 05:42:54 +0000 Subject: [PATCH 04/37] feat: Add test case for STRUCT column in anywidget Adds a test case to verify that a DataFrame with a STRUCT column is correctly displayed in anywidget mode. This test confirms that displaying a STRUCT column does not raise an exception that would trigger the fallback to the deferred representation. It mocks `IPython.display.display` to capture the `TableWidget` instance and asserts that the rendered HTML contains the expected string representation of the STRUCT data. --- tests/system/small/test_anywidget.py | 41 ++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 4f82f7d81d..15e902ee16 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -460,6 +460,47 @@ def test_sql_anywidget_mode(mock_table_widget, session: bf.Session): mock_table_widget.assert_called_once() +@mock.patch("IPython.display.display") +def test_struct_column_anywidget_mode(mock_display, session: bf.Session): + """ + Test that a DataFrame with a STRUCT column is displayed in anywidget mode + and does not fall back to the deferred representation. This confirms that + anywidget can handle complex types without raising an exception that would + trigger the fallback mechanism. + """ + pandas_df = pd.DataFrame( + { + "a": [1], + "b": [{"c": 2, "d": 3}], + } + ) + bf_df = session.read_pandas(pandas_df) + + with bf.option_context("display.repr_mode", "anywidget"): + with mock.patch( + "bigframes.dataframe.formatter.repr_query_job" + ) as mock_repr_query_job: + # Trigger the display logic. + result = bf_df._repr_html_() + + # Assert that we did NOT fall back to the deferred representation. + mock_repr_query_job.assert_not_called() + + # Assert that display was called with a TableWidget + mock_display.assert_called_once() + widget = mock_display.call_args[0][0] + from bigframes.display import TableWidget + + assert isinstance(widget, TableWidget) + + # Assert that the widget's html contains the struct + html = widget.table_html + assert "{'c': 2, 'd': 3}" in html + + # Assert that _repr_html_ returns an empty string + assert result == "" + + # TODO(shuowei): Add tests for custom index and multiindex # This may not be necessary for the SQL Cell use case but should be # considered for completeness. From a9116c71f964cf5c8cec16b3249ded6faffb30ec Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 9 Oct 2025 08:25:28 +0000 Subject: [PATCH 05/37] fix presubmit --- bigframes/display/anywidget.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index a0b4f809d8..15a022a1f5 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -218,16 +218,14 @@ def _set_table_html(self) -> None: start = self.page * self.page_size end = start + self.page_size - # fetch more data if the requested page is outside our cache - cached_data = self._cached_data - while len(cached_data) < end and not self._all_data_loaded: - if self._get_next_batch(): - cached_data = self._cached_data - else: - break - - # Get the data for the current page - page_data = cached_data.iloc[start:end] + # fetch more data if the requested page is outside our cache + cached_data = self._cached_data + while len(cached_data) < end and not self._all_data_loaded: + if self._get_next_batch(): + cached_data = self._cached_data + else: + break + page_data = cached_data.iloc[start:end] # Generate HTML table self.table_html = bigframes.display.html.render_html( @@ -250,8 +248,5 @@ def _page_size_changed(self, _change: Dict[str, Any]) -> None: # Reset the page to 0 when page size changes to avoid invalid page states self.page = 0 - # Reset batches to use new page size for future data fetching - self._reset_batches_for_new_page_size() - # Update the table display self._set_table_html() From f0992c693221965fe57b8ab0edba322a4ac0b303 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 19:44:37 +0000 Subject: [PATCH 06/37] Revert accidental changes to test_function.py --- tests/system/large/blob/test_function.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 2124234649..7963fabd0b 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -768,7 +768,6 @@ def test_blob_transcribe( ) .to_pandas() ) - print(actual) # check relative length expected_text = "Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress" From 3aefdbfe73e3ec6bfbc611c185aafc94de8e1538 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 19:46:37 +0000 Subject: [PATCH 07/37] revert accidental change to blob.py --- bigframes/operations/blob.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 038cc1d891..1f6b75a8f5 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -974,23 +974,21 @@ def audio_transcribe( prompt_text = "**Task:** Transcribe the provided audio. **Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio." - # Use bbq.ai.generate() to transcribe audio + # Convert the audio series to the runtime representation required by the model. + audio_runtime = audio_series.blob._get_runtime("R", with_metadata=True) + transcribed_results = bbq.ai.generate( - prompt=(prompt_text, audio_series), + prompt=(prompt_text, audio_runtime), connection_id=connection, endpoint=model_name, - request_type="unspecified", + model_params={"generationConfig": {"temperature": 0.0}}, ) - transcribed_content_series = transcribed_results.struct.field("result").rename( - "transcribed_content" - ) transcribed_content_series = transcribed_results.struct.field("result").rename( "transcribed_content" ) if verbose: - transcribed_status_series = transcribed_results.struct.field("status") transcribed_status_series = transcribed_results.struct.field("status") results_df = bpd.DataFrame( { From 7d4cfdfb6d677ad31245dfd4dda56ef8ff9a3fe6 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 19:54:54 +0000 Subject: [PATCH 08/37] change return type --- bigframes/dataframe.py | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0eb53ddc03..0259e94132 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -23,7 +23,6 @@ import re import sys import textwrap -import traceback import typing from typing import ( Any, @@ -788,18 +787,6 @@ def __repr__(self) -> str: if opts.repr_mode in ("deferred"): return formatter.repr_query_job(self._compute_dry_run()) - # Anywidget mode uses interative display - if opts.repr_mode == "anywidget": - # Try to display with anywidget, fall back to deferred if not in IPython - try: - from bigframes import display - - widget = display.TableWidget(self.copy()) - return widget._repr_html_() # Return widget's HTML representation - except (AttributeError, ValueError, ImportError): - # Not in IPython environment, fall back to deferred mode - return formatter.repr_query_job(self._compute_dry_run()) - # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? @@ -863,27 +850,27 @@ def _repr_html_(self) -> str: if opts.repr_mode == "anywidget": try: + import anywidget # noqa: F401 from IPython.display import display as ipython_display + import traitlets # noqa: F401 from bigframes import display - - # Always create a new widget instance for each display call - # This ensures that each cell gets its own widget and prevents - # unintended sharing between cells - widget = display.TableWidget(df.copy()) - - ipython_display(widget) - return "" # Return empty string since we used display() - - except (AttributeError, ValueError, ImportError): - # Fallback if anywidget is not available + except ImportError: warnings.warn( - "Anywidget mode is not available. " + "anywidget or its dependencies are not installed. " "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. " - f"Falling back to deferred mode. Error: {traceback.format_exc()}" + "Falling back to deferred mode." ) return formatter.repr_query_job(self._compute_dry_run()) + # Always create a new widget instance for each display call + # This ensures that each cell gets its own widget and prevents + # unintended sharing between cells + widget = display.TableWidget(df.copy()) + + ipython_display(widget) + return "" # Return empty string since we used display() + # Continue with regular HTML rendering for non-anywidget modes # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the From a951810f11b3872d6b5868e37b5a56de08ff9655 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 20 Oct 2025 08:26:16 +0000 Subject: [PATCH 09/37] add todo and revert change --- bigframes/dataframe.py | 65 +++++++++++++++------------------- bigframes/display/anywidget.py | 25 ++++++++----- 2 files changed, 46 insertions(+), 44 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0259e94132..b7d1268b61 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -23,6 +23,7 @@ import re import sys import textwrap +import traceback import typing from typing import ( Any, @@ -782,9 +783,9 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - - # Only deferred mode shows dry run - if opts.repr_mode in ("deferred"): + # anywdiget mode uses the same display logic as the "deferred" mode + # for faster execution + if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back. Maybe @@ -850,27 +851,27 @@ def _repr_html_(self) -> str: if opts.repr_mode == "anywidget": try: - import anywidget # noqa: F401 from IPython.display import display as ipython_display - import traitlets # noqa: F401 from bigframes import display - except ImportError: + + # Always create a new widget instance for each display call + # This ensures that each cell gets its own widget and prevents + # unintended sharing between cells + widget = display.TableWidget(df.copy()) + + ipython_display(widget) + return "" # Return empty string since we used display() + + except (AttributeError, ValueError, ImportError): + # Fallback if anywidget is not available warnings.warn( - "anywidget or its dependencies are not installed. " + "Anywidget mode is not available. " "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. " - "Falling back to deferred mode." + f"Falling back to deferred mode. Error: {traceback.format_exc()}" ) return formatter.repr_query_job(self._compute_dry_run()) - # Always create a new widget instance for each display call - # This ensures that each cell gets its own widget and prevents - # unintended sharing between cells - widget = display.TableWidget(df.copy()) - - ipython_display(widget) - return "" # Return empty string since we used display() - # Continue with regular HTML rendering for non-anywidget modes # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the @@ -2568,33 +2569,25 @@ def sort_index( ) -> None: ... + @validations.requires_index def sort_index( self, *, - axis: Union[int, str] = 0, ascending: bool = True, inplace: bool = False, na_position: Literal["first", "last"] = "last", ) -> Optional[DataFrame]: - if utils.get_axis_number(axis) == 0: - if na_position not in ["first", "last"]: - raise ValueError("Param na_position must be one of 'first' or 'last'") - na_last = na_position == "last" - index_columns = self._block.index_columns - ordering = [ - order.ascending_over(column, na_last) - if ascending - else order.descending_over(column, na_last) - for column in index_columns - ] - block = self._block.order_by(ordering) - else: # axis=1 - _, indexer = self.columns.sort_values( - return_indexer=True, ascending=ascending, na_position=na_position # type: ignore - ) - block = self._block.select_columns( - [self._block.value_columns[i] for i in indexer] - ) + if na_position not in ["first", "last"]: + raise ValueError("Param na_position must be one of 'first' or 'last'") + na_last = na_position == "last" + index_columns = self._block.index_columns + ordering = [ + order.ascending_over(column, na_last) + if ascending + else order.descending_over(column, na_last) + for column in index_columns + ] + block = self._block.order_by(ordering) if inplace: self._set_block(block) return None diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 15a022a1f5..1ed6eeb8a5 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -209,6 +209,15 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" + # TODO (shuowei): BigFrames Series with db_dtypes.JSONArrowType column + # fails to convert to pandas DataFrame in anywidget environment due to + # missing handling in to_pandas_batches(). b/453561268 + # For empty dataframe, render empty table with headers. + if self.row_count == 0: + page_data = self._cached_data + else: + start = self.page * self.page_size + end = start + self.page_size if self._error_message: self.table_html = ( f"
{self._error_message}
" @@ -218,14 +227,14 @@ def _set_table_html(self) -> None: start = self.page * self.page_size end = start + self.page_size - # fetch more data if the requested page is outside our cache - cached_data = self._cached_data - while len(cached_data) < end and not self._all_data_loaded: - if self._get_next_batch(): - cached_data = self._cached_data - else: - break - page_data = cached_data.iloc[start:end] + # fetch more data if the requested page is outside our cache + cached_data = self._cached_data + while len(cached_data) < end and not self._all_data_loaded: + if self._get_next_batch(): + cached_data = self._cached_data + else: + break + page_data = cached_data.iloc[start:end] # Generate HTML table self.table_html = bigframes.display.html.render_html( From 89521d2b108492f7b7fed2c29a00b729228a6d1e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 20 Oct 2025 08:28:56 +0000 Subject: [PATCH 10/37] Revert "add todo and revert change" This reverts commit 153e1d203c273d6755623b3db30bd2256a240cc1. --- bigframes/dataframe.py | 65 +++++++++++++++++++--------------- bigframes/display/anywidget.py | 3 -- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b7d1268b61..0259e94132 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -23,7 +23,6 @@ import re import sys import textwrap -import traceback import typing from typing import ( Any, @@ -783,9 +782,9 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # anywdiget mode uses the same display logic as the "deferred" mode - # for faster execution - if opts.repr_mode in ("deferred", "anywidget"): + + # Only deferred mode shows dry run + if opts.repr_mode in ("deferred"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back. Maybe @@ -851,27 +850,27 @@ def _repr_html_(self) -> str: if opts.repr_mode == "anywidget": try: + import anywidget # noqa: F401 from IPython.display import display as ipython_display + import traitlets # noqa: F401 from bigframes import display - - # Always create a new widget instance for each display call - # This ensures that each cell gets its own widget and prevents - # unintended sharing between cells - widget = display.TableWidget(df.copy()) - - ipython_display(widget) - return "" # Return empty string since we used display() - - except (AttributeError, ValueError, ImportError): - # Fallback if anywidget is not available + except ImportError: warnings.warn( - "Anywidget mode is not available. " + "anywidget or its dependencies are not installed. " "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. " - f"Falling back to deferred mode. Error: {traceback.format_exc()}" + "Falling back to deferred mode." ) return formatter.repr_query_job(self._compute_dry_run()) + # Always create a new widget instance for each display call + # This ensures that each cell gets its own widget and prevents + # unintended sharing between cells + widget = display.TableWidget(df.copy()) + + ipython_display(widget) + return "" # Return empty string since we used display() + # Continue with regular HTML rendering for non-anywidget modes # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the @@ -2569,25 +2568,33 @@ def sort_index( ) -> None: ... - @validations.requires_index def sort_index( self, *, + axis: Union[int, str] = 0, ascending: bool = True, inplace: bool = False, na_position: Literal["first", "last"] = "last", ) -> Optional[DataFrame]: - if na_position not in ["first", "last"]: - raise ValueError("Param na_position must be one of 'first' or 'last'") - na_last = na_position == "last" - index_columns = self._block.index_columns - ordering = [ - order.ascending_over(column, na_last) - if ascending - else order.descending_over(column, na_last) - for column in index_columns - ] - block = self._block.order_by(ordering) + if utils.get_axis_number(axis) == 0: + if na_position not in ["first", "last"]: + raise ValueError("Param na_position must be one of 'first' or 'last'") + na_last = na_position == "last" + index_columns = self._block.index_columns + ordering = [ + order.ascending_over(column, na_last) + if ascending + else order.descending_over(column, na_last) + for column in index_columns + ] + block = self._block.order_by(ordering) + else: # axis=1 + _, indexer = self.columns.sort_values( + return_indexer=True, ascending=ascending, na_position=na_position # type: ignore + ) + block = self._block.select_columns( + [self._block.value_columns[i] for i in indexer] + ) if inplace: self._set_block(block) return None diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 1ed6eeb8a5..cf5d4e6310 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -209,9 +209,6 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" - # TODO (shuowei): BigFrames Series with db_dtypes.JSONArrowType column - # fails to convert to pandas DataFrame in anywidget environment due to - # missing handling in to_pandas_batches(). b/453561268 # For empty dataframe, render empty table with headers. if self.row_count == 0: page_data = self._cached_data From 1c155d04b2fd9d0ec286e4458cb5ae758e201c1a Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 20 Oct 2025 17:12:13 +0000 Subject: [PATCH 11/37] Add todo --- bigframes/display/anywidget.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index cf5d4e6310..1ed6eeb8a5 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -209,6 +209,9 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" + # TODO (shuowei): BigFrames Series with db_dtypes.JSONArrowType column + # fails to convert to pandas DataFrame in anywidget environment due to + # missing handling in to_pandas_batches(). b/453561268 # For empty dataframe, render empty table with headers. if self.row_count == 0: page_data = self._cached_data From 86cb692d9ad30ca1cf36f3490ce5fb4c5ac8a0ec Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 01:54:37 +0000 Subject: [PATCH 12/37] Fix: Handle JSON dtype in anywidget display This commit fixes an AttributeError that occurred when displaying a DataFrame with a JSON column in anywidget mode. The dtype check was incorrect and has been updated. Additionally, the SQL compilation for casting JSON to string has been corrected to use TO_JSON_STRING. --- .../ibis_compiler/scalar_op_registry.py | 2 +- bigframes/display/anywidget.py | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index e983fc7e21..7b17aac61a 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1036,7 +1036,7 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): if to_type == ibis_dtypes.bool: return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x) if to_type == ibis_dtypes.string: - return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x) + return to_json_string(x) # TODO: either inline this function, or push rest of this op into the function return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 1ed6eeb8a5..ff5a51f312 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -74,7 +74,21 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget." ) - self._dataframe = dataframe + super().__init__() + # Workaround for Arrow bug https://github.com/apache/arrow/issues/45262 + # JSON columns are not supported in `to_pandas_batches` and will be converted to string. + json_cols = [ + col + for col, dtype in dataframe.dtypes.items() + if dtype == bigframes.dtypes.JSON_DTYPE + ] + if json_cols: + df_copy = dataframe.copy() + for col in json_cols: + df_copy[str(col)] = df_copy[str(col)].astype("string") + self._dataframe = df_copy + else: + self._dataframe = dataframe super().__init__() @@ -209,9 +223,6 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" - # TODO (shuowei): BigFrames Series with db_dtypes.JSONArrowType column - # fails to convert to pandas DataFrame in anywidget environment due to - # missing handling in to_pandas_batches(). b/453561268 # For empty dataframe, render empty table with headers. if self.row_count == 0: page_data = self._cached_data From 81013c6133fe3beeaec2dce300b03b2165ca2d79 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 02:40:03 +0000 Subject: [PATCH 13/37] revert a change --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0259e94132..41bc4db03c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -784,7 +784,7 @@ def __repr__(self) -> str: max_results = opts.max_rows # Only deferred mode shows dry run - if opts.repr_mode in ("deferred"): + if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back. Maybe From 6ea72810f26c13e76b0a92ed1333ba1b91d6edbf Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 02:41:42 +0000 Subject: [PATCH 14/37] revert a change --- bigframes/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 41bc4db03c..fc60e47f7a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,7 +783,8 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # Only deferred mode shows dry run + # anywdiget mode uses the same display logic as the "deferred" mode + # for faster execution if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) From 63b7918bba81abdef65a13cbd486b5f1bd5b364b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 02:49:38 +0000 Subject: [PATCH 15/37] Revert: Restore bigframes/dataframe.py to state from 42da847 --- bigframes/dataframe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index fc60e47f7a..0259e94132 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,9 +783,8 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # anywdiget mode uses the same display logic as the "deferred" mode - # for faster execution - if opts.repr_mode in ("deferred", "anywidget"): + # Only deferred mode shows dry run + if opts.repr_mode in ("deferred"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back. Maybe From 4aa98797c42a93da2c3d1fb89d4293886d01d120 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 02:57:02 +0000 Subject: [PATCH 16/37] remove anywidget from early return, allow execution proceeds to _repr_html_() --- bigframes/dataframe.py | 15 +++++++++++++++ bigframes/operations/output_schemas.py | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0259e94132..5ecc123417 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -863,6 +863,21 @@ def _repr_html_(self) -> str: ) return formatter.repr_query_job(self._compute_dry_run()) + # The anywidget frontend doesn't support the db_dtypes JSON type, so + # convert to strings for display. + json_cols = [ + series_name + for series_name, series in df.items() + if bigframes.dtypes.contains_db_dtypes_json_dtype(series.dtype) + ] + if json_cols: + warnings.warn( + "Converting JSON columns to strings for display. " + "This is temporary and will be removed when the frontend supports JSON types." + ) + for col in json_cols: + df[col] = df[col]._apply_unary_op(ops.json_ops.ToJSONString()) + # Always create a new widget instance for each display call # This ensures that each cell gets its own widget and prevents # unintended sharing between cells diff --git a/bigframes/operations/output_schemas.py b/bigframes/operations/output_schemas.py index ff9c9883dc..2a72d4f48f 100644 --- a/bigframes/operations/output_schemas.py +++ b/bigframes/operations/output_schemas.py @@ -14,6 +14,8 @@ import pyarrow as pa +from bigframes import dtypes + def parse_sql_type(sql: str) -> pa.DataType: """ @@ -43,6 +45,9 @@ def parse_sql_type(sql: str) -> pa.DataType: if sql.upper() == "BOOL": return pa.bool_() + if sql.upper() == "JSON": + return dtypes.JSON_ARROW_TYPE + if sql.upper().startswith("ARRAY<") and sql.endswith(">"): inner_type = sql[len("ARRAY<") : -1] return pa.list_(parse_sql_type(inner_type)) From 62d8608418bdd30e931a6ccd72e24be9ce591de5 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 03:13:23 +0000 Subject: [PATCH 17/37] remove unnecessary changes --- bigframes/dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5ecc123417..0dc8bc3d5f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,8 +783,9 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # Only deferred mode shows dry run - if opts.repr_mode in ("deferred"): + # anywdiget mode uses the same display logic as the "deferred" mode + # for faster execution + if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back. Maybe From 24d766d18fdd7fc8275bed76000219486bdeb828 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 03:21:04 +0000 Subject: [PATCH 18/37] remove redundant code change --- bigframes/dataframe.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0dc8bc3d5f..4fe259639e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -23,6 +23,7 @@ import re import sys import textwrap +import traceback import typing from typing import ( Any, @@ -856,11 +857,12 @@ def _repr_html_(self) -> str: import traitlets # noqa: F401 from bigframes import display - except ImportError: + except (AttributeError, ValueError, ImportError): + # Fallback if anywidget is not available warnings.warn( - "anywidget or its dependencies are not installed. " + "Anywidget mode is not available. " "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. " - "Falling back to deferred mode." + f"Falling back to deferred mode. Error: {traceback.format_exc()}" ) return formatter.repr_query_job(self._compute_dry_run()) From 9239f20cdff25191082bdd789eccdc4ff6d6b584 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 03:54:11 +0000 Subject: [PATCH 19/37] code style change --- .../ibis_compiler/scalar_op_registry.py | 2 +- bigframes/dataframe.py | 48 +++--- notebooks/dataframes/anywidget_mode.ipynb | 160 ++++++++++++++---- 3 files changed, 153 insertions(+), 57 deletions(-) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 7b17aac61a..74314cd882 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -30,7 +30,7 @@ from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS import bigframes.core.compile.ibis_compiler.default_ordering from bigframes.core.compile.ibis_compiler.scalar_op_compiler import ( - scalar_op_compiler, # TODO(tswast): avoid import of variables + scalar_op_compiler, # TODO(b/428238610): avoid import of variables ) import bigframes.core.compile.ibis_types import bigframes.operations as ops diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 4fe259639e..38500b8fb3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -784,7 +784,7 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # anywdiget mode uses the same display logic as the "deferred" mode + # anywidget mode uses the same display logic as the "deferred" mode # for faster execution if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) @@ -857,6 +857,29 @@ def _repr_html_(self) -> str: import traitlets # noqa: F401 from bigframes import display + + # The anywidget frontend doesn't support the db_dtypes JSON type, so + # convert to strings for display. + json_cols = [ + series_name + for series_name, series in df.items() + if bigframes.dtypes.contains_db_dtypes_json_dtype(series.dtype) + ] + if json_cols: + warnings.warn( + "Converting JSON columns to strings for display. " + "This is temporary and will be removed when the frontend supports JSON types." + ) + for col in json_cols: + df[col] = df[col]._apply_unary_op(ops.json_ops.ToJSONString()) + + # Always create a new widget instance for each display call + # This ensures that each cell gets its own widget and prevents + # unintended sharing between cells + widget = display.TableWidget(df.copy()) + + ipython_display(widget) + return "" # Return empty string since we used display() except (AttributeError, ValueError, ImportError): # Fallback if anywidget is not available warnings.warn( @@ -866,29 +889,6 @@ def _repr_html_(self) -> str: ) return formatter.repr_query_job(self._compute_dry_run()) - # The anywidget frontend doesn't support the db_dtypes JSON type, so - # convert to strings for display. - json_cols = [ - series_name - for series_name, series in df.items() - if bigframes.dtypes.contains_db_dtypes_json_dtype(series.dtype) - ] - if json_cols: - warnings.warn( - "Converting JSON columns to strings for display. " - "This is temporary and will be removed when the frontend supports JSON types." - ) - for col in json_cols: - df[col] = df[col]._apply_unary_op(ops.json_ops.ToJSONString()) - - # Always create a new widget instance for each display call - # This ensures that each cell gets its own widget and prevents - # unintended sharing between cells - widget = display.TableWidget(df.copy()) - - ipython_display(widget) - return "" # Return empty string since we used display() - # Continue with regular HTML rendering for non-anywidget modes # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 88f9658009..903d88b210 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -73,25 +73,11 @@ "id": "f289d250", "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "071c0a905297406ba6c990cbbb8fc28d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "TableWidget(page_size=10, row_count=5552452, table_html='
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dataframe.py:869: UserWarning: Converting JSON columns to strings for display. This is temporary and will be removed when the frontend supports JSON types.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 0 Bytes in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ab607cc7263f4a159ecfe63682c5e651", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5, table_html='
(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\\n\n", + " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\\n\n", + " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\\n\n", + " *\\n\n", + "FROM `bigquery-public-data.labeled_patents.extracted_data`\\n\n", + "LIMIT 5;\\n\n", + "\"\"\")" + ] } ], "metadata": { "kernelspec": { + "display_name": "3.10.18", "display_name": "3.10.18", "language": "python", "name": "python3" @@ -368,6 +463,7 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" + "version": "3.10.18" } }, "nbformat": 4, From 48d6c665c072237bc61aa7d705663bfe0aa4ddb8 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 04:07:52 +0000 Subject: [PATCH 20/37] tescase update --- tests/system/small/test_anywidget.py | 68 ++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 15e902ee16..40804e1853 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -24,7 +24,6 @@ # Test constants to avoid change detector tests EXPECTED_ROW_COUNT = 6 EXPECTED_PAGE_SIZE = 2 -EXPECTED_TOTAL_PAGES = 3 @pytest.fixture(scope="module") @@ -112,21 +111,19 @@ def empty_bf_df( return session.read_pandas(empty_pandas_df) -def mock_execute_result_with_params( - self, schema, total_rows_val, arrow_batches_val, *args, **kwargs -): - """ - Mocks an execution result with configurable total_rows and arrow_batches. - """ - from bigframes.session.executor import ExecuteResult +@pytest.fixture(scope="module") +def json_df(session: bf.Session) -> bf.dataframe.DataFrame: + """Create a DataFrame with a JSON column for testing.""" + import bigframes.dtypes - return ExecuteResult( - iter(arrow_batches_val), - schema=schema, - query_job=None, - total_bytes=None, - total_rows=total_rows_val, + pandas_df = pd.DataFrame( + { + "a": [1], + "b": ['{"c": 2, "d": 3}'], + } ) + pandas_df["b"] = pandas_df["b"].astype(bigframes.dtypes.JSON_DTYPE) + return session.read_pandas(pandas_df) def _assert_html_matches_pandas_slice( @@ -438,12 +435,6 @@ def test_setting_page_size_above_max_should_be_clamped(table_widget): # The page size is clamped to the maximum. assert table_widget.page_size == expected_clamped_size - """ - Test that the widget's CSS is loaded correctly. - """ - css_content = table_widget._css - assert ".bigframes-widget .footer" in css_content - @mock.patch("bigframes.display.TableWidget") def test_sql_anywidget_mode(mock_table_widget, session: bf.Session): @@ -501,6 +492,43 @@ def test_struct_column_anywidget_mode(mock_display, session: bf.Session): assert result == "" +def test_widget_creation_should_load_css_for_rendering(table_widget): + """ + Test that the widget's CSS is loaded correctly. + """ + css_content = table_widget._css + assert ".bigframes-widget .footer" in css_content + + +@mock.patch("IPython.display.display") +def test_json_column_anywidget_mode(mock_display, json_df: bf.dataframe.DataFrame): + """ + Test that a DataFrame with a JSON column is displayed in anywidget mode + by converting JSON to string, and does not fall back to deferred representation. + """ + with bf.option_context("display.repr_mode", "anywidget"): + with mock.patch( + "bigframes.dataframe.formatter.repr_query_job" + ) as mock_repr_query_job: + result = json_df._repr_html_() + + # Assert no fallback + mock_repr_query_job.assert_not_called() + + # Assert TableWidget was created and displayed + mock_display.assert_called_once() + widget = mock_display.call_args[0][0] + from bigframes.display import TableWidget + + assert isinstance(widget, TableWidget) + + # Assert JSON was converted to string in the HTML + html = widget.table_html + assert "{"c":2,"d":3}" in html + + assert result == "" + + # TODO(shuowei): Add tests for custom index and multiindex # This may not be necessary for the SQL Cell use case but should be # considered for completeness. From 4cb8cd22a6c93342d599f2e976ec05aa92b42302 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 04:15:35 +0000 Subject: [PATCH 21/37] revert a change --- bigframes/core/compile/ibis_compiler/scalar_op_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 74314cd882..7b17aac61a 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -30,7 +30,7 @@ from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS import bigframes.core.compile.ibis_compiler.default_ordering from bigframes.core.compile.ibis_compiler.scalar_op_compiler import ( - scalar_op_compiler, # TODO(b/428238610): avoid import of variables + scalar_op_compiler, # TODO(tswast): avoid import of variables ) import bigframes.core.compile.ibis_types import bigframes.operations as ops From 75a6d68e3e4c4c6474f1aaef2e257b6a0e0d1cf3 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 04:24:10 +0000 Subject: [PATCH 22/37] final touch of notebook --- notebooks/dataframes/anywidget_mode.ipynb | 28 +++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 903d88b210..23be36701d 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -142,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "25b38c1408434091865f4bf9525dd069", + "model_id": "f0ed74d739b64a56a6e3750968b155e1", "version_major": 2, "version_minor": 0 }, @@ -217,7 +217,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cb4f246802a1407cb966321d8724ea27", + "model_id": "fd00566103744c189a52033df9c9db7a", "version_major": 2, "version_minor": 0 }, @@ -330,7 +330,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5b48c05254224c4dbce56f2793d6b661", + "model_id": "2233934e95b84a87b01b9352ca36346d", "version_major": 2, "version_minor": 0 }, @@ -369,7 +369,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 85.9 kB in 14 seconds of slot time.\n", + " Query processed 85.9 kB in 11 seconds of slot time.\n", " " ], "text/plain": [ @@ -408,7 +408,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ab607cc7263f4a159ecfe63682c5e651", + "model_id": "3e3e09d7adee4bcaa5b3540603c2418a", "version_major": 2, "version_minor": 0 }, @@ -432,15 +432,15 @@ } ], "source": [ - "bpd._read_gbq_colab(\"\"\"\\n\n", - "SELECT\\n\n", - " AI.GENERATE(\\n\n", - " prompt=>(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\\n\n", - " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\\n\n", - " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\\n\n", - " *\\n\n", - "FROM `bigquery-public-data.labeled_patents.extracted_data`\\n\n", - "LIMIT 5;\\n\n", + "bpd._read_gbq_colab(\"\"\"\n", + " SELECT\n", + " AI.GENERATE(\n", + " prompt=>(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\n", + " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\n", + " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\n", + " *\n", + " FROM `bigquery-public-data.labeled_patents.extracted_data`\n", + " LIMIT 5;\n", "\"\"\")" ] } From 8dc2171ee13b43b7d9a776fae960f0c27e3b03dd Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 17:21:16 +0000 Subject: [PATCH 23/37] fix presumbit error --- bigframes/operations/output_schemas.py | 5 ---- notebooks/dataframes/anywidget_mode.ipynb | 34 +++++++++++------------ tests/system/small/test_anywidget.py | 22 +++++++-------- 3 files changed, 27 insertions(+), 34 deletions(-) diff --git a/bigframes/operations/output_schemas.py b/bigframes/operations/output_schemas.py index 2a72d4f48f..ff9c9883dc 100644 --- a/bigframes/operations/output_schemas.py +++ b/bigframes/operations/output_schemas.py @@ -14,8 +14,6 @@ import pyarrow as pa -from bigframes import dtypes - def parse_sql_type(sql: str) -> pa.DataType: """ @@ -45,9 +43,6 @@ def parse_sql_type(sql: str) -> pa.DataType: if sql.upper() == "BOOL": return pa.bool_() - if sql.upper() == "JSON": - return dtypes.JSON_ARROW_TYPE - if sql.upper().startswith("ARRAY<") and sql.endswith(">"): inner_type = sql[len("ARRAY<") : -1] return pa.list_(parse_sql_type(inner_type)) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 23be36701d..154afea7e1 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "d10bfca4", "metadata": {}, "outputs": [], @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "ca22f059", "metadata": {}, "outputs": [], @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "1bc5aaf3", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "f289d250", "metadata": {}, "outputs": [ @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "42bb02ab", "metadata": {}, "outputs": [ @@ -123,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "ce250157", "metadata": {}, "outputs": [ @@ -142,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f0ed74d739b64a56a6e3750968b155e1", + "model_id": "6e46f6d1352043a4baee57fa089f2b0c", "version_major": 2, "version_minor": 0 }, @@ -160,7 +160,7 @@ "Computation deferred. Computation will process 171.4 MB" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -179,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "6920d49b", "metadata": {}, "outputs": [ @@ -217,7 +217,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fd00566103744c189a52033df9c9db7a", + "model_id": "88d370b617b545809eb7bb8e5c66ea0e", "version_major": 2, "version_minor": 0 }, @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "12b68f15", "metadata": {}, "outputs": [ @@ -288,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "a9d5d13a", "metadata": {}, "outputs": [ @@ -330,7 +330,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2233934e95b84a87b01b9352ca36346d", + "model_id": "dec19e8788b74219b88bccfc65e3b9c0", "version_major": 2, "version_minor": 0 }, @@ -361,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "added-cell-1", "metadata": {}, "outputs": [ @@ -369,7 +369,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 85.9 kB in 11 seconds of slot time.\n", + " Query processed 85.9 kB in 21 seconds of slot time.\n", " " ], "text/plain": [ @@ -408,7 +408,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3e3e09d7adee4bcaa5b3540603c2418a", + "model_id": "774357b4083c47c8a5e1fd33bb6af188", "version_major": 2, "version_minor": 0 }, @@ -426,7 +426,7 @@ "Computation deferred. Computation will process 0 Bytes" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 40804e1853..890d591de5 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -62,8 +62,7 @@ def table_widget(paginated_bf_df: bf.dataframe.DataFrame): Helper fixture to create a TableWidget instance with a fixed page size. This reduces duplication across tests that use the same widget configuration. """ - - from bigframes.display import TableWidget + from bigframes.display.anywidget import TableWidget with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): # Delay context manager cleanup of `max_rows` until after tests finish. @@ -92,7 +91,7 @@ def small_bf_df( @pytest.fixture def small_widget(small_bf_df): """Helper fixture for tests using a DataFrame smaller than the page size.""" - from bigframes.display import TableWidget + from bigframes.display.anywidget import TableWidget with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 5): yield TableWidget(small_bf_df) @@ -152,10 +151,11 @@ def test_widget_initialization_should_calculate_total_row_count( paginated_bf_df: bf.dataframe.DataFrame, ): """A TableWidget should correctly calculate the total row count on creation.""" - from bigframes.display import TableWidget + from bigframes.display.anywidget import TableWidget with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): widget = TableWidget(paginated_bf_df) + widget = TableWidget(paginated_bf_df) assert widget.row_count == EXPECTED_ROW_COUNT @@ -266,7 +266,7 @@ def test_widget_pagination_should_work_with_custom_page_size( A widget should paginate correctly with a custom page size of 3. """ with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 3): - from bigframes.display import TableWidget + from bigframes.display.anywidget import TableWidget widget = TableWidget(paginated_bf_df) assert widget.page_size == 3 @@ -312,7 +312,7 @@ def test_widget_page_size_should_be_immutable_after_creation( by subsequent changes to global options. """ with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - from bigframes.display import TableWidget + from bigframes.display.anywidget import TableWidget widget = TableWidget(paginated_bf_df) assert widget.page_size == 2 @@ -331,7 +331,7 @@ def test_widget_page_size_should_be_immutable_after_creation( def test_empty_widget_should_have_zero_row_count(empty_bf_df: bf.dataframe.DataFrame): """Given an empty DataFrame, the widget's row count should be 0.""" with bf.option_context("display.repr_mode", "anywidget"): - from bigframes.display import TableWidget + from bigframes.display.anywidget import TableWidget widget = TableWidget(empty_bf_df) @@ -341,7 +341,7 @@ def test_empty_widget_should_have_zero_row_count(empty_bf_df: bf.dataframe.DataF def test_empty_widget_should_render_table_headers(empty_bf_df: bf.dataframe.DataFrame): """Given an empty DataFrame, the widget should still render table headers.""" with bf.option_context("display.repr_mode", "anywidget"): - from bigframes.display import TableWidget + from bigframes.display.anywidget import TableWidget widget = TableWidget(empty_bf_df) @@ -477,10 +477,8 @@ def test_struct_column_anywidget_mode(mock_display, session: bf.Session): # Assert that we did NOT fall back to the deferred representation. mock_repr_query_job.assert_not_called() - # Assert that display was called with a TableWidget - mock_display.assert_called_once() widget = mock_display.call_args[0][0] - from bigframes.display import TableWidget + from bigframes.display.anywidget import TableWidget assert isinstance(widget, TableWidget) @@ -518,7 +516,7 @@ def test_json_column_anywidget_mode(mock_display, json_df: bf.dataframe.DataFram # Assert TableWidget was created and displayed mock_display.assert_called_once() widget = mock_display.call_args[0][0] - from bigframes.display import TableWidget + from bigframes.display.anywidget import TableWidget assert isinstance(widget, TableWidget) From 2adc426e9b97ea49397a1ce19ec30ca304af4410 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 17:43:03 +0000 Subject: [PATCH 24/37] remove invlaid test with anywidget bug fix --- tests/system/small/test_series.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 5ace3f54d8..63c2f6c498 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -4077,7 +4077,6 @@ def test_json_astype_others(data, to_type, errors): pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), - pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), ], ) def test_json_astype_others_raise_error(data, to_type): From faf1bb2d4f7123084a0ba0d09d5414c26fa02a11 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 22:33:00 +0000 Subject: [PATCH 25/37] fix presubmit --- bigframes/series.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/bigframes/series.py b/bigframes/series.py index ad1f091803..e90a360418 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -609,6 +609,15 @@ def astype( if errors not in ["raise", "null"]: raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") dtype = bigframes.dtypes.bigframes_type(dtype) + + # BigQuery doesn't support CAST(json_col AS STRING), but it does support + # TO_JSON_STRING(json_col). + if ( + self.dtype == bigframes.dtypes.JSON_DTYPE + and dtype == bigframes.dtypes.STRING_DTYPE + ): + return self._apply_unary_op(ops.json_ops.ToJSONString()) + return self._apply_unary_op( bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null")) ) From 7a83b804f27dd4216f90c21bf13885958beec924 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 23:42:19 +0000 Subject: [PATCH 26/37] fix polar complier --- bigframes/core/compile/polars/compiler.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index acaf1b8f22..1a55cef63a 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -407,6 +407,19 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: assert isinstance(op, json_ops.JSONDecode) return input.str.json_decode(_DTYPE_MAPPING[op.to_type]) + @compile_op.register(json_ops.ToJSONString) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.str.json_decode(pl.String()) + + @compile_op.register(json_ops.ParseJSON) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.str.json_decode(pl.String()) + + @compile_op.register(json_ops.JSONExtract) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + assert isinstance(op, json_ops.JSONExtract) + return input.str.json_extract(json_path=op.json_path) + @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: return pl.concat_list(*inputs) From 233e857acfeb1d8fdfc47e90391ccc555054272e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 23:49:03 +0000 Subject: [PATCH 27/37] Revert an unnecessary change --- bigframes/operations/output_schemas.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bigframes/operations/output_schemas.py b/bigframes/operations/output_schemas.py index ff9c9883dc..2a72d4f48f 100644 --- a/bigframes/operations/output_schemas.py +++ b/bigframes/operations/output_schemas.py @@ -14,6 +14,8 @@ import pyarrow as pa +from bigframes import dtypes + def parse_sql_type(sql: str) -> pa.DataType: """ @@ -43,6 +45,9 @@ def parse_sql_type(sql: str) -> pa.DataType: if sql.upper() == "BOOL": return pa.bool_() + if sql.upper() == "JSON": + return dtypes.JSON_ARROW_TYPE + if sql.upper().startswith("ARRAY<") and sql.endswith(">"): inner_type = sql[len("ARRAY<") : -1] return pa.list_(parse_sql_type(inner_type)) From 11daddb7ebb22e6544dfd4fb2572b4c7b630ff00 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 22:27:23 +0000 Subject: [PATCH 28/37] apply the workaround to i/O layer --- bigframes/core/compile/polars/compiler.py | 42 +- bigframes/dataframe.py | 17 +- tests/system/small/test_dataframe.py | 6151 +-------------------- 3 files changed, 60 insertions(+), 6150 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 1a55cef63a..681ca37da7 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -45,13 +45,13 @@ polars_installed = True if TYPE_CHECKING: import polars as pl + import pyarrow as pa else: try: import bigframes._importing - # Use import_polars() instead of importing directly so that we check - # the version numbers. pl = bigframes._importing.import_polars() + import pyarrow as pa except Exception: polars_installed = False @@ -409,11 +409,13 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @compile_op.register(json_ops.ToJSONString) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - return input.str.json_decode(pl.String()) + # Convert JSON to string representation + return input.cast(pl.String()) @compile_op.register(json_ops.ParseJSON) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - return input.str.json_decode(pl.String()) + # Parse string as JSON - this should decode, not encode + return input.str.json_decode() @compile_op.register(json_ops.JSONExtract) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @@ -599,9 +601,35 @@ def compile_readlocal(self, node: nodes.ReadLocalNode): scan_item.source_id: scan_item.id.sql for scan_item in node.scan_list.items } - lazy_frame = cast( - pl.DataFrame, pl.from_arrow(node.local_data_source.data) - ).lazy() + + # Workaround for PyArrow bug https://github.com/apache/arrow/issues/45262 + # Convert JSON columns to strings before Polars processing + arrow_data = node.local_data_source.data + schema = arrow_data.schema + + # Check if any columns are JSON type + json_field_indices = [ + i + for i, field in enumerate(schema) + if pa.types.is_extension_type(field.type) + and field.type.extension_name == "google:sqlType:json" + ] + + if json_field_indices: + # Convert JSON columns to string columns + new_arrays = [] + new_fields = [] + for i, field in enumerate(schema): + if i in json_field_indices: + # Cast JSON to string + new_arrays.append(arrow_data.column(i).cast(pa.string())) + new_fields.append(pa.field(field.name, pa.string())) + else: + new_arrays.append(arrow_data.column(i)) + new_fields.append(field) + arrow_data = pa.table(new_arrays, schema=pa.schema(new_fields)) + + lazy_frame = cast(pl.DataFrame, pl.from_arrow(arrow_data)).lazy() lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) if node.offsets_col: lazy_frame = lazy_frame.with_columns( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 38500b8fb3..788a47f38b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1965,7 +1965,22 @@ def _to_pandas_batches( *, allow_large_results: Optional[bool] = None, ) -> blocks.PandasBatches: - return self._block.to_pandas_batches( + # Workaround for PyArrow bug https://github.com/apache/arrow/issues/45262 + # JSON columns are not supported in to_pandas_batches + json_cols = [ + str(col_name) # Cast to string + for col_name, dtype in self.dtypes.items() + if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype) + ] + + df = self + if json_cols: + # Convert JSON columns to strings before materialization + df = df.copy() + for col in json_cols: + df[col] = df[col].astype("string") + + return df._block.to_pandas_batches( page_size=page_size, max_results=max_results, allow_large_results=allow_large_results, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 79f8efd00f..ffd9bc512b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1,6144 +1,11 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +def test_to_pandas_batches_with_json_columns(session): + """Test that JSON columns are properly handled in to_pandas_batches.""" + # Create a DataFrame with JSON column + df = session.read_gbq('SELECT JSON \'{"key": "value"}\' as json_col') -import io -import operator -import sys -import tempfile -import typing -from typing import Dict, List, Tuple + # This should not raise an error + batches = df._to_pandas_batches(page_size=10) + result = next(batches) -import geopandas as gpd # type: ignore -import numpy as np -import pandas as pd -import pandas.testing -import pyarrow as pa # type: ignore -import pytest - -import bigframes -import bigframes._config.display_options as display_options -import bigframes.core.indexes as bf_indexes -import bigframes.dataframe as dataframe -import bigframes.dtypes as dtypes -import bigframes.pandas as bpd -import bigframes.series as series -from bigframes.testing.utils import ( - assert_dfs_equivalent, - assert_pandas_df_equal, - assert_series_equal, - assert_series_equivalent, -) - - -def test_df_construct_copy(scalars_dfs): - columns = ["int64_col", "string_col", "float64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - # Make the mapping from label to col_id non-trivial - bf_df = scalars_df.copy() - bf_df["int64_col"] = bf_df["int64_col"] / 2 - pd_df = scalars_pandas_df.copy() - pd_df["int64_col"] = pd_df["int64_col"] / 2 - - bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas() - - pd_result = pd.DataFrame(pd_df, columns=columns) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_construct_pandas_default(scalars_dfs): - # This should trigger the inlined codepath - columns = [ - "int64_too", - "int64_col", - "float64_col", - "bool_col", - "string_col", - "date_col", - "datetime_col", - "numeric_col", - "float64_col", - "time_col", - "timestamp_col", - ] - _, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns).to_pandas() - pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("write_engine"), - [ - ("bigquery_inline"), - ("bigquery_load"), - ("bigquery_streaming"), - ("bigquery_write"), - ], -) -def test_read_pandas_all_nice_types( - session: bigframes.Session, scalars_pandas_df_index: pd.DataFrame, write_engine -): - bf_result = session.read_pandas( - scalars_pandas_df_index, write_engine=write_engine - ).to_pandas() - pandas.testing.assert_frame_equal(bf_result, scalars_pandas_df_index) - - -def test_df_construct_large_strings(): - data = [["hello", "w" + "o" * 50000 + "rld"]] - bf_result = dataframe.DataFrame(data).to_pandas() - pd_result = pd.DataFrame(data, dtype=pd.StringDtype(storage="pyarrow")) - pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -def test_df_construct_pandas_load_job(scalars_dfs_maybe_ordered): - # This should trigger the inlined codepath - columns = [ - "int64_too", - "int64_col", - "float64_col", - "bool_col", - "string_col", - "date_col", - "datetime_col", - "numeric_col", - "float64_col", - "time_col", - "timestamp_col", - "geography_col", - ] - _, scalars_pandas_df = scalars_dfs_maybe_ordered - bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns) - pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) - assert_dfs_equivalent(pd_result, bf_result) - - -def test_df_construct_structs(session): - pd_frame = pd.Series( - [ - {"version": 1, "project": "pandas"}, - {"version": 2, "project": "pandas"}, - {"version": 1, "project": "numpy"}, - ] - ).to_frame() - bf_series = session.read_pandas(pd_frame) - pd.testing.assert_frame_equal( - bf_series.to_pandas(), pd_frame, check_index_type=False, check_dtype=False - ) - - -def test_df_construct_local_concat_pd(scalars_pandas_df_index, session): - pd_df = pd.concat([scalars_pandas_df_index, scalars_pandas_df_index]) - - bf_df = session.read_pandas(pd_df) - - pd.testing.assert_frame_equal( - bf_df.to_pandas(), pd_df, check_index_type=False, check_dtype=False - ) - - -def test_df_construct_pandas_set_dtype(scalars_dfs): - columns = [ - "int64_too", - "int64_col", - "float64_col", - "bool_col", - ] - _, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame( - scalars_pandas_df, columns=columns, dtype="Float64" - ).to_pandas() - pd_result = pd.DataFrame(scalars_pandas_df, columns=columns, dtype="Float64") - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_construct_from_series(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - bf_result = dataframe.DataFrame( - {"a": scalars_df["int64_col"], "b": scalars_df["string_col"]}, - dtype="string[pyarrow]", - ) - pd_result = pd.DataFrame( - {"a": scalars_pandas_df["int64_col"], "b": scalars_pandas_df["string_col"]}, - dtype="string[pyarrow]", - ) - assert_dfs_equivalent(pd_result, bf_result) - - -def test_df_construct_from_dict(): - input_dict = { - "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], - # With a space in column name. We use standardized SQL schema ids to solve the problem that BQ schema doesn't support column names with spaces. b/296751058 - "Max Speed": [380.0, 370.0, 24.0, 26.0], - } - bf_result = dataframe.DataFrame(input_dict).to_pandas() - pd_result = pd.DataFrame(input_dict) - - pandas.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("json_type"), - [ - pytest.param(dtypes.JSON_DTYPE), - pytest.param("json"), - ], -) -def test_df_construct_w_json_dtype(json_type): - data = [ - "1", - "false", - '["a", {"b": 1}, null]', - None, - ] - df = dataframe.DataFrame({"json_col": data}, dtype=json_type) - - assert df["json_col"].dtype == dtypes.JSON_DTYPE - assert df["json_col"][1] == "false" - - -def test_df_construct_inline_respects_location(reset_default_session_and_location): - # Note: This starts a thread-local session. - with bpd.option_context("bigquery.location", "europe-west1"): - df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) - df.to_gbq() - assert df.query_job is not None - table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) - - assert table.location == "europe-west1" - - -def test_df_construct_dtype(): - data = { - "int_col": [1, 2, 3], - "string_col": ["1.1", "2.0", "3.5"], - "float_col": [1.0, 2.0, 3.0], - } - dtype = pd.StringDtype(storage="pyarrow") - bf_result = dataframe.DataFrame(data, dtype=dtype) - pd_result = pd.DataFrame(data, dtype=dtype) - pd_result.index = pd_result.index.astype("Int64") - pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) - - -def test_get_column(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - series = scalars_df[col_name] - bf_result = series.to_pandas() - pd_result = scalars_pandas_df[col_name] - assert_series_equal(bf_result, pd_result) - - -def test_get_column_nonstring(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - series = scalars_df.rename(columns={"int64_col": 123.1})[123.1] - bf_result = series.to_pandas() - pd_result = scalars_pandas_df.rename(columns={"int64_col": 123.1})[123.1] - assert_series_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - "row_slice", - [ - (slice(1, 7, 2)), - (slice(1, 7, None)), - (slice(None, -3, None)), - ], -) -def test_get_rows_with_slice(scalars_dfs, row_slice): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[row_slice].to_pandas() - pd_result = scalars_pandas_df[row_slice] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_hasattr(scalars_dfs): - scalars_df, _ = scalars_dfs - assert hasattr(scalars_df, "int64_col") - assert hasattr(scalars_df, "head") - assert not hasattr(scalars_df, "not_exist") - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_head_with_custom_column_labels( - scalars_df_index, scalars_pandas_df_index, ordered -): - rename_mapping = { - "int64_col": "Integer Column", - "string_col": "言語列", - } - bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) - bf_result = bf_df.to_pandas(ordered=ordered) - pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) - - -def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): - rename_mapping = { - "int64_col": "Integer Column", - "string_col": "言語列", - } - bf_df = scalars_df_index.rename(columns=rename_mapping).tail(3) - bf_result = bf_df.to_pandas() - pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).tail(3) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - ("all",), - ], -) -def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep): - bf_result = scalars_df_index.nlargest(3, ["bool_col", "int64_too"], keep=keep) - pd_result = scalars_pandas_df_index.nlargest( - 3, ["bool_col", "int64_too"], keep=keep - ) - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - ("all",), - ], -) -def test_df_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): - bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep) - pd_result = scalars_pandas_df_index.nsmallest(6, ["bool_col"], keep=keep) - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_get_column_by_attr(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - series = scalars_df.int64_col - bf_result = series.to_pandas() - pd_result = scalars_pandas_df.int64_col - assert_series_equal(bf_result, pd_result) - - -def test_get_columns(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_names = ["bool_col", "float64_col", "int64_col"] - df_subset = scalars_df.get(col_names) - df_pandas = df_subset.to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df[col_names].columns - ) - - -def test_get_columns_default(scalars_dfs): - scalars_df, _ = scalars_dfs - col_names = ["not", "column", "names"] - result = scalars_df.get(col_names, "default_val") - assert result == "default_val" - - -@pytest.mark.parametrize( - ("loc", "column", "value", "allow_duplicates"), - [ - (0, 666, 2, False), - (5, "float64_col", 2.2, True), - (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True), - pytest.param( - 14, - "test", - 2, - False, - marks=pytest.mark.xfail( - raises=IndexError, - ), - ), - pytest.param( - 12, - "int64_col", - 2, - False, - marks=pytest.mark.xfail( - raises=ValueError, - ), - ), - ], -) -def test_insert(scalars_dfs, loc, column, value, allow_duplicates): - scalars_df, scalars_pandas_df = scalars_dfs - # insert works inplace, so will influence other tests. - # make a copy to avoid inplace changes. - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.insert(loc, column, value, allow_duplicates) - pd_df.insert(loc, column, value, allow_duplicates) - - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False) - - -def test_mask_series_cond(scalars_df_index, scalars_pandas_df_index): - cond_bf = scalars_df_index["int64_col"] > 0 - cond_pd = scalars_pandas_df_index["int64_col"] > 0 - - bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] - pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] - bf_result = bf_df.mask(cond_bf, bf_df + 1).to_pandas() - pd_result = pd_df.mask(cond_pd, pd_df + 1) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_mask_callable(scalars_df_index, scalars_pandas_df_index): - def is_positive(x): - return x > 0 - - bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] - pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] - bf_result = bf_df.mask(cond=is_positive, other=lambda x: x + 1).to_pandas() - pd_result = pd_df.mask(cond=is_positive, other=lambda x: x + 1) - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_multi_column(scalars_df_index, scalars_pandas_df_index): - # Test when a dataframe has multi-columns. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - - dataframe_bf.columns = pd.MultiIndex.from_tuples( - [("str1", 1), ("str2", 2)], names=["STR", "INT"] - ) - cond_bf = dataframe_bf["str1"] > 0 - - with pytest.raises(NotImplementedError) as context: - dataframe_bf.where(cond_bf).to_pandas() - assert ( - str(context.value) - == "The dataframe.where() method does not support multi-column." - ) - - -def test_where_series_cond(scalars_df_index, scalars_pandas_df_index): - # Condition is dataframe, other is None (as default). - cond_bf = scalars_df_index["int64_col"] > 0 - cond_pd = scalars_pandas_df_index["int64_col"] > 0 - bf_result = scalars_df_index.where(cond_bf).to_pandas() - pd_result = scalars_pandas_df_index.where(cond_pd) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_series_cond_const_other(scalars_df_index, scalars_pandas_df_index): - # Condition is a series, other is a constant. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - dataframe_bf.columns.name = "test_name" - dataframe_pd.columns.name = "test_name" - - cond_bf = dataframe_bf["int64_col"] > 0 - cond_pd = dataframe_pd["int64_col"] > 0 - other = 0 - - bf_result = dataframe_bf.where(cond_bf, other).to_pandas() - pd_result = dataframe_pd.where(cond_pd, other) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_series_cond_dataframe_other(scalars_df_index, scalars_pandas_df_index): - # Condition is a series, other is a dataframe. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - cond_bf = dataframe_bf["int64_col"] > 0 - cond_pd = dataframe_pd["int64_col"] > 0 - other_bf = -dataframe_bf - other_pd = -dataframe_pd - - bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() - pd_result = dataframe_pd.where(cond_pd, other_pd) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_dataframe_cond(scalars_df_index, scalars_pandas_df_index): - # Condition is a dataframe, other is None. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - cond_bf = dataframe_bf > 0 - cond_pd = dataframe_pd > 0 - - bf_result = dataframe_bf.where(cond_bf, None).to_pandas() - pd_result = dataframe_pd.where(cond_pd, None) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_dataframe_cond_const_other(scalars_df_index, scalars_pandas_df_index): - # Condition is a dataframe, other is a constant. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - cond_bf = dataframe_bf > 0 - cond_pd = dataframe_pd > 0 - other_bf = 10 - other_pd = 10 - - bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() - pd_result = dataframe_pd.where(cond_pd, other_pd) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_dataframe_cond_dataframe_other( - scalars_df_index, scalars_pandas_df_index -): - # Condition is a dataframe, other is a dataframe. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - cond_bf = dataframe_bf > 0 - cond_pd = dataframe_pd > 0 - other_bf = dataframe_bf * 2 - other_pd = dataframe_pd * 2 - - bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() - pd_result = dataframe_pd.where(cond_pd, other_pd) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_callable_cond_constant_other(scalars_df_index, scalars_pandas_df_index): - # Condition is callable, other is a constant. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - other = 10 - - bf_result = dataframe_bf.where(lambda x: x > 0, other).to_pandas() - pd_result = dataframe_pd.where(lambda x: x > 0, other) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_dataframe_cond_callable_other(scalars_df_index, scalars_pandas_df_index): - # Condition is a dataframe, other is callable. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - cond_bf = dataframe_bf > 0 - cond_pd = dataframe_pd > 0 - - def func(x): - return x * 2 - - bf_result = dataframe_bf.where(cond_bf, func).to_pandas() - pd_result = dataframe_pd.where(cond_pd, func) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_callable_cond_callable_other(scalars_df_index, scalars_pandas_df_index): - # Condition is callable, other is callable too. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - def func(x): - return x["int64_col"] > 0 - - bf_result = dataframe_bf.where(func, lambda x: x * 2).to_pandas() - pd_result = dataframe_pd.where(func, lambda x: x * 2) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_series_other(scalars_df_index): - # When other is a series, throw an error. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - - with pytest.raises( - ValueError, - match="Seires is not a supported replacement type!", - ): - dataframe_bf.where(dataframe_bf > 0, dataframe_bf["int64_col"]) - - -def test_drop_column(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - df_pandas = scalars_df.drop(columns=col_name).to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df.drop(columns=col_name).columns - ) - - -def test_drop_columns(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_names = ["int64_col", "geography_col", "time_col"] - df_pandas = scalars_df.drop(columns=col_names).to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df.drop(columns=col_names).columns - ) - - -def test_drop_labels_axis_1(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - labels = ["int64_col", "geography_col", "time_col"] - - pd_result = scalars_pandas_df.drop(labels=labels, axis=1) - bf_result = scalars_df.drop(labels=labels, axis=1).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_with_custom_column_labels(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - rename_mapping = { - "int64_col": "Integer Column", - "string_col": "言語列", - } - dropped_columns = [ - "言語列", - "timestamp_col", - ] - bf_df = scalars_df.rename(columns=rename_mapping).drop(columns=dropped_columns) - bf_result = bf_df.to_pandas() - pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( - columns=dropped_columns - ) - assert_pandas_df_equal(bf_result, pd_result) - - -def test_df_memory_usage(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - pd_result = scalars_pandas_df.memory_usage() - bf_result = scalars_df.memory_usage() - - pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5) - - -def test_df_info(scalars_dfs): - expected = ( - "\n" - "Index: 9 entries, 0 to 8\n" - "Data columns (total 14 columns):\n" - " # Column Non-Null Count Dtype\n" - "--- ------------- ---------------- ------------------------------\n" - " 0 bool_col 8 non-null boolean\n" - " 1 bytes_col 6 non-null binary[pyarrow]\n" - " 2 date_col 7 non-null date32[day][pyarrow]\n" - " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" - " 4 geography_col 4 non-null geometry\n" - " 5 int64_col 8 non-null Int64\n" - " 6 int64_too 9 non-null Int64\n" - " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" - " 8 float64_col 7 non-null Float64\n" - " 9 rowindex_2 9 non-null Int64\n" - " 10 string_col 8 non-null string\n" - " 11 time_col 6 non-null time64[us][pyarrow]\n" - " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" - " 13 duration_col 7 non-null duration[us][pyarrow]\n" - "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), duration[us][pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" - "memory usage: 1341 bytes\n" - ) - - scalars_df, _ = scalars_dfs - bf_result = io.StringIO() - - scalars_df.info(buf=bf_result) - - assert expected == bf_result.getvalue() - - -@pytest.mark.parametrize( - ("include", "exclude"), - [ - ("Int64", None), - (["int"], None), - ("number", None), - ([pd.Int64Dtype(), pd.BooleanDtype()], None), - (None, [pd.Int64Dtype(), pd.BooleanDtype()]), - ("Int64", ["boolean"]), - ], -) -def test_select_dtypes(scalars_dfs, include, exclude): - scalars_df, scalars_pandas_df = scalars_dfs - - pd_result = scalars_pandas_df.select_dtypes(include=include, exclude=exclude) - bf_result = scalars_df.select_dtypes(include=include, exclude=exclude).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - pd_result = scalars_pandas_df.drop(index=[4, 1, 2]) - bf_result = scalars_df.drop(index=[4, 1, 2]).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_pandas_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - drop_index = scalars_pandas_df.iloc[[4, 1, 2]].index - - pd_result = scalars_pandas_df.drop(index=drop_index) - bf_result = scalars_df.drop(index=drop_index).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_bigframes_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - drop_index = scalars_df.loc[[4, 1, 2]].index - drop_pandas_index = scalars_pandas_df.loc[[4, 1, 2]].index - - pd_result = scalars_pandas_df.drop(index=drop_pandas_index) - bf_result = scalars_df.drop(index=drop_index).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_bigframes_index_with_na(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.copy() - scalars_pandas_df = scalars_pandas_df.copy() - scalars_df = scalars_df.set_index("bytes_col") - scalars_pandas_df = scalars_pandas_df.set_index("bytes_col") - drop_index = scalars_df.iloc[[3, 5]].index - drop_pandas_index = scalars_pandas_df.iloc[[3, 5]].index - - pd_result = scalars_pandas_df.drop(index=drop_pandas_index) # drop_pandas_index) - bf_result = scalars_df.drop(index=drop_index).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_bigframes_multiindex(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.copy() - scalars_pandas_df = scalars_pandas_df.copy() - sub_df = scalars_df.iloc[[4, 1, 2]] - sub_pandas_df = scalars_pandas_df.iloc[[4, 1, 2]] - sub_df = sub_df.set_index(["bytes_col", "numeric_col"]) - sub_pandas_df = sub_pandas_df.set_index(["bytes_col", "numeric_col"]) - drop_index = sub_df.index - drop_pandas_index = sub_pandas_df.index - - scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) - scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) - bf_result = scalars_df.drop(index=drop_index).to_pandas() - pd_result = scalars_pandas_df.drop(index=drop_pandas_index) - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_labels_axis_0(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - pd_result = scalars_pandas_df.drop(labels=[4, 1, 2], axis=0) - bf_result = scalars_df.drop(labels=[4, 1, 2], axis=0).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_index_and_columns(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - pd_result = scalars_pandas_df.drop(index=[4, 1, 2], columns="int64_col") - bf_result = scalars_df.drop(index=[4, 1, 2], columns="int64_col").to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_rename(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"bool_col": 1.2345} - df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns - ) - - -def test_df_peek(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - peek_result = scalars_df.peek(n=3, force=False, allow_large_results=True) - - pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) - assert len(peek_result) == 3 - - -def test_df_peek_with_large_results_not_allowed(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - peek_result = scalars_df.peek(n=3, force=False, allow_large_results=False) - - pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) - assert len(peek_result) == 3 - - -def test_df_peek_filtered(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - peek_result = scalars_df[scalars_df.int64_col != 0].peek(n=3, force=False) - pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) - assert len(peek_result) == 3 - - -def test_df_peek_exception(scalars_dfs): - scalars_df, _ = scalars_dfs - - with pytest.raises(ValueError): - # Window ops aren't compatible with efficient peeking - scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3, force=False) - - -def test_df_peek_force_default(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - peek_result = scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3) - pd.testing.assert_index_equal( - scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns - ) - assert len(peek_result) == 3 - - -def test_df_peek_reset_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - peek_result = ( - scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3) - ) - pd.testing.assert_index_equal( - scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns - ) - assert len(peek_result) == 3 - - -def test_repr_w_all_rows(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - # Remove columns with flaky formatting, like NUMERIC columns (which use the - # object dtype). Also makes a copy so that mutating the index name doesn't - # break other tests. - scalars_df = scalars_df.drop(columns=["numeric_col"]) - scalars_pandas_df = scalars_pandas_df.drop(columns=["numeric_col"]) - - # When there are 10 or fewer rows, the outputs should be identical. - actual = repr(scalars_df.head(10)) - - with display_options.pandas_repr(bigframes.options.display): - expected = repr(scalars_pandas_df.head(10)) - - assert actual == expected - - -def test_join_repr(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - scalars_df = ( - scalars_df[["int64_col"]] - .join(scalars_df.set_index("int64_col")[["int64_too"]]) - .sort_index() - ) - scalars_pandas_df = ( - scalars_pandas_df[["int64_col"]] - .join(scalars_pandas_df.set_index("int64_col")[["int64_too"]]) - .sort_index() - ) - # Pandas join result index name seems to depend on the index values in a way that bigframes can't match exactly - scalars_pandas_df.index.name = None - - actual = repr(scalars_df) - - with display_options.pandas_repr(bigframes.options.display): - expected = repr(scalars_pandas_df) - - assert actual == expected - - -def test_repr_w_display_options(scalars_dfs, session): - metrics = session._metrics - scalars_df, _ = scalars_dfs - # get a pandas df of the expected format - df, _ = scalars_df._block.to_pandas() - pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) - pandas_df.index.name = scalars_df.index.name - - executions_pre = metrics.execution_count - with bigframes.option_context( - "display.max_rows", 10, "display.max_columns", 5, "display.max_colwidth", 10 - ): - - # When there are 10 or fewer rows, the outputs should be identical except for the extra note. - actual = scalars_df.head(10).__repr__() - executions_post = metrics.execution_count - - with display_options.pandas_repr(bigframes.options.display): - pandas_repr = pandas_df.head(10).__repr__() - - assert actual == pandas_repr - assert (executions_post - executions_pre) <= 3 - - -def test_repr_html_w_all_rows(scalars_dfs, session): - metrics = session._metrics - scalars_df, _ = scalars_dfs - # get a pandas df of the expected format - df, _ = scalars_df._block.to_pandas() - pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) - pandas_df.index.name = scalars_df.index.name - - executions_pre = metrics.execution_count - # When there are 10 or fewer rows, the outputs should be identical except for the extra note. - actual = scalars_df.head(10)._repr_html_() - executions_post = metrics.execution_count - - with display_options.pandas_repr(bigframes.options.display): - pandas_repr = pandas_df.head(10)._repr_html_() - - expected = ( - pandas_repr - + f"[{len(pandas_df.index)} rows x {len(pandas_df.columns)} columns in total]" - ) - assert actual == expected - assert (executions_post - executions_pre) <= 3 - - -def test_df_column_name_with_space(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"bool_col": "bool col"} - df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns - ) - - -def test_df_column_name_duplicate(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"int64_too": "int64_col"} - df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns - ) - - -def test_get_df_column_name_duplicate(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"int64_too": "int64_col"} - - bf_result = scalars_df.rename(columns=col_name_dict)["int64_col"].to_pandas() - pd_result = scalars_pandas_df.rename(columns=col_name_dict)["int64_col"] - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) - - -@pytest.mark.parametrize( - ("indices", "axis"), - [ - ([1, 3, 5], 0), - ([2, 4, 6], 1), - ([1, -3, -5, -6], "index"), - ([-2, -4, -6], "columns"), - ], -) -def test_take_df(scalars_dfs, indices, axis): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.take(indices, axis=axis).to_pandas() - pd_result = scalars_pandas_df.take(indices, axis=axis) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_filter_df(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_bool_series = scalars_df["bool_col"] - bf_result = scalars_df[bf_bool_series].to_pandas() - - pd_bool_series = scalars_pandas_df["bool_col"] - pd_result = scalars_pandas_df[pd_bool_series] - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_read_gbq_direct_to_batches_row_count(unordered_session): - df = unordered_session.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") - iter = df.to_pandas_batches() - assert iter.total_rows == 5552452 - - -def test_df_to_pandas_batches(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - capped_unfiltered_batches = scalars_df.to_pandas_batches(page_size=2, max_results=6) - bf_bool_series = scalars_df["bool_col"] - filtered_batches = scalars_df[bf_bool_series].to_pandas_batches() - - pd_bool_series = scalars_pandas_df["bool_col"] - pd_result = scalars_pandas_df[pd_bool_series] - - assert 6 == capped_unfiltered_batches.total_rows - assert len(pd_result) == filtered_batches.total_rows - assert_pandas_df_equal(pd.concat(filtered_batches), pd_result) - - -@pytest.mark.parametrize( - ("literal", "expected_dtype"), - ( - pytest.param( - 2, - dtypes.INT_DTYPE, - id="INT64", - ), - # ==================================================================== - # NULL values - # - # These are regression tests for b/428999884. It needs to be possible to - # set a column to NULL with a desired type (not just the pandas default - # of float64). - # ==================================================================== - pytest.param(None, dtypes.FLOAT_DTYPE, id="NULL-None"), - pytest.param( - pa.scalar(None, type=pa.int64()), - dtypes.INT_DTYPE, - id="NULL-pyarrow-TIMESTAMP", - ), - pytest.param( - pa.scalar(None, type=pa.timestamp("us", tz="UTC")), - dtypes.TIMESTAMP_DTYPE, - id="NULL-pyarrow-TIMESTAMP", - ), - pytest.param( - pa.scalar(None, type=pa.timestamp("us")), - dtypes.DATETIME_DTYPE, - id="NULL-pyarrow-DATETIME", - ), - ), -) -def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype): - scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.assign(new_col=literal) - bf_result = df.to_pandas() - - new_col_pd = literal - if isinstance(literal, pa.Scalar): - # PyArrow integer scalars aren't yet supported in pandas Int64Dtype. - new_col_pd = literal.as_py() - - # Pandas might not pick the same dtype as BigFrames, but it should at least - # be castable to it. - pd_result = scalars_pandas_df.assign(new_col=new_col_pd) - pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_assign_new_column_w_loc(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.loc[:, "new_col"] = 2 - pd_df.loc[:, "new_col"] = 2 - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("scalar",), - [ - (2.1,), - (None,), - ], -) -def test_assign_new_column_w_setitem(scalars_dfs, scalar): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df["new_col"] = scalar - pd_df["new_col"] = scalar - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `float64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Float64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_new_column_w_setitem_dataframe(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df["int64_col"] = bf_df["int64_too"].to_frame() - pd_df["int64_col"] = pd_df["int64_too"].to_frame() - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_df["int64_col"] = pd_df["int64_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df) - - -def test_assign_new_column_w_setitem_dataframe_error(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - - with pytest.raises(ValueError): - bf_df["impossible_col"] = bf_df[["int64_too", "string_col"]] - with pytest.raises(ValueError): - pd_df["impossible_col"] = pd_df[["int64_too", "string_col"]] - - -def test_assign_new_column_w_setitem_list(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - bf_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] - pd_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - pd_result["new_col_2"] = pd_result["new_col_2"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - - # set the custom index - pd_df = pd_df.set_index(["string_col", "int64_col"]) - bf_df = bf_df.set_index(["string_col", "int64_col"]) - - bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_new_column_w_setitem_list_error(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - - with pytest.raises(ValueError): - pd_df["new_col"] = [1, 2, 3] # should be len 9, is 3 - with pytest.raises(ValueError): - bf_df["new_col"] = [1, 2, 3] - - -@pytest.mark.parametrize( - ("key", "value"), - [ - pytest.param(["int64_col", "int64_too"], 1, id="scalar_to_existing_column"), - pytest.param( - ["int64_col", "int64_too"], [1, 2], id="sequence_to_existing_column" - ), - pytest.param( - ["int64_col", "new_col"], [1, 2], id="sequence_to_partial_new_column" - ), - pytest.param( - ["new_col", "new_col_too"], [1, 2], id="sequence_to_full_new_column" - ), - pytest.param( - pd.Index(("new_col", "new_col_too")), - [1, 2], - id="sequence_to_full_new_column_as_index", - ), - ], -) -def test_setitem_multicolumn_with_literals(scalars_dfs, key, value): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.copy() - pd_result = scalars_pandas_df.copy() - - bf_result[key] = value - pd_result[key] = value - - pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False) - - -def test_setitem_multicolumn_with_literals_different_lengths_raise_error(scalars_dfs): - scalars_df, _ = scalars_dfs - bf_result = scalars_df.copy() - - with pytest.raises(ValueError): - bf_result[["int64_col", "int64_too"]] = [1] - - -def test_setitem_multicolumn_with_dataframes(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.copy() - pd_result = scalars_pandas_df.copy() - - bf_result[["int64_col", "int64_too"]] = bf_result[["int64_too", "int64_col"]] / 2 - pd_result[["int64_col", "int64_too"]] = pd_result[["int64_too", "int64_col"]] / 2 - - pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False) - - -def test_setitem_multicolumn_with_dataframes_series_on_rhs_raise_error(scalars_dfs): - scalars_df, _ = scalars_dfs - bf_result = scalars_df.copy() - - with pytest.raises(ValueError): - bf_result[["int64_col", "int64_too"]] = bf_result["int64_col"] / 2 - - -def test_setitem_multicolumn_with_dataframes_different_lengths_raise_error(scalars_dfs): - scalars_df, _ = scalars_dfs - bf_result = scalars_df.copy() - - with pytest.raises(ValueError): - bf_result[["int64_col"]] = bf_result[["int64_col", "int64_too"]] / 2 - - -def test_assign_existing_column(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - kwargs = {"int64_col": 2} - df = scalars_df.assign(**kwargs) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign(**kwargs) - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_assign_listlike_to_empty_df(session): - empty_df = dataframe.DataFrame(session=session) - empty_pandas_df = pd.DataFrame() - - bf_result = empty_df.assign(new_col=[1, 2, 3]) - pd_result = empty_pandas_df.assign(new_col=[1, 2, 3]) - - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - pd_result.index = pd_result.index.astype("Int64") - assert_pandas_df_equal(bf_result.to_pandas(), pd_result) - - -def test_assign_to_empty_df_multiindex_error(session): - empty_df = dataframe.DataFrame(session=session) - empty_pandas_df = pd.DataFrame() - - empty_df["empty_col_1"] = typing.cast(series.Series, []) - empty_df["empty_col_2"] = typing.cast(series.Series, []) - empty_pandas_df["empty_col_1"] = [] - empty_pandas_df["empty_col_2"] = [] - empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"]) - empty_pandas_df = empty_pandas_df.set_index(["empty_col_1", "empty_col_2"]) - - with pytest.raises(ValueError): - empty_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) - with pytest.raises(ValueError): - empty_pandas_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_assign_series(scalars_dfs, ordered): - scalars_df, scalars_pandas_df = scalars_dfs - column_name = "int64_col" - df = scalars_df.assign(new_col=scalars_df[column_name]) - bf_result = df.to_pandas(ordered=ordered) - pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) - - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) - - -def test_assign_series_overwrite(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - column_name = "int64_col" - df = scalars_df.assign(**{column_name: scalars_df[column_name] + 3}) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign( - **{column_name: scalars_pandas_df[column_name] + 3} - ) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_assign_sequential(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - kwargs = {"int64_col": 2, "new_col": 3, "new_col2": 4} - df = scalars_df.assign(**kwargs) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign(**kwargs) - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") - - assert_pandas_df_equal(bf_result, pd_result) - - -# Require an index so that the self-join is consistent each time. -def test_assign_same_table_different_index_performs_self_join( - scalars_df_index, scalars_pandas_df_index -): - column_name = "int64_col" - bf_df = scalars_df_index.assign( - alternative_index=scalars_df_index["rowindex_2"] + 2 - ) - pd_df = scalars_pandas_df_index.assign( - alternative_index=scalars_pandas_df_index["rowindex_2"] + 2 - ) - bf_df_2 = bf_df.set_index("alternative_index") - pd_df_2 = pd_df.set_index("alternative_index") - bf_result = bf_df.assign(new_col=bf_df_2[column_name] * 10).to_pandas() - pd_result = pd_df.assign(new_col=pd_df_2[column_name] * 10) - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -# Different table expression must have Index -def test_assign_different_df( - scalars_df_index, scalars_df_2_index, scalars_pandas_df_index -): - column_name = "int64_col" - df = scalars_df_index.assign(new_col=scalars_df_2_index[column_name]) - bf_result = df.to_pandas() - # Doesn't matter to pandas if it comes from the same DF or a different DF. - pd_result = scalars_pandas_df_index.assign( - new_col=scalars_pandas_df_index[column_name] - ) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_assign_different_df_w_loc( - scalars_df_index, scalars_df_2_index, scalars_pandas_df_index -): - bf_df = scalars_df_index.copy() - bf_df2 = scalars_df_2_index.copy() - pd_df = scalars_pandas_df_index.copy() - assert "int64_col" in bf_df.columns - assert "int64_col" in pd_df.columns - bf_df.loc[:, "int64_col"] = bf_df2.loc[:, "int64_col"] + 1 - pd_df.loc[:, "int64_col"] = pd_df.loc[:, "int64_col"] + 1 - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_different_df_w_setitem( - scalars_df_index, scalars_df_2_index, scalars_pandas_df_index -): - bf_df = scalars_df_index.copy() - bf_df2 = scalars_df_2_index.copy() - pd_df = scalars_pandas_df_index.copy() - assert "int64_col" in bf_df.columns - assert "int64_col" in pd_df.columns - bf_df["int64_col"] = bf_df2["int64_col"] + 1 - pd_df["int64_col"] = pd_df["int64_col"] + 1 - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_callable_lambda(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - kwargs = {"new_col": lambda x: x["int64_col"] + x["int64_too"]} - df = scalars_df.assign(**kwargs) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign(**kwargs) - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("axis", "how", "ignore_index", "subset"), - [ - (0, "any", False, None), - (0, "any", True, None), - (0, "all", False, ["bool_col", "time_col"]), - (0, "any", False, ["bool_col", "time_col"]), - (0, "all", False, "time_col"), - (1, "any", False, None), - (1, "all", False, None), - ], -) -def test_df_dropna_by_how(scalars_dfs, axis, how, ignore_index, subset): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna( - axis=axis, how=how, ignore_index=ignore_index, subset=subset - ) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("axis", "ignore_index", "subset", "thresh"), - [ - (0, False, None, 2), - (0, True, None, 3), - (1, False, None, 2), - ], -) -def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh): - """ - Tests that dropna correctly keeps rows/columns with a minimum number - of non-null values. - """ - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - df_result = scalars_df.dropna( - axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset - ) - pd_result = scalars_pandas_df.dropna( - axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset - ) - - bf_result = df_result.to_pandas() - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_dropna_range_columns(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.copy() - scalars_pandas_df = scalars_pandas_df.copy() - scalars_df.columns = pandas.RangeIndex(0, len(scalars_df.columns)) - scalars_pandas_df.columns = pandas.RangeIndex(0, len(scalars_pandas_df.columns)) - - df = scalars_df.dropna() - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna() - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_interpolate(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - columns = ["int64_col", "int64_too", "float64_col"] - bf_result = scalars_df[columns].interpolate().to_pandas() - # Pandas can only interpolate on "float64" columns - # https://github.com/pandas-dev/pandas/issues/40252 - pd_result = scalars_pandas_df[columns].astype("float64").interpolate() - - pandas.testing.assert_frame_equal( - bf_result, - pd_result, - check_index_type=False, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "col, fill_value", - [ - (["int64_col", "float64_col"], 3), - (["string_col"], "A"), - (["datetime_col"], pd.Timestamp("2023-01-01")), - ], -) -def test_df_fillna(scalars_dfs, col, fill_value): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col].fillna(fill_value).to_pandas() - pd_result = scalars_pandas_df[col].fillna(fill_value) - - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -def test_df_replace_scalar_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace(555.555, 3).to_pandas() - pd_result = scalars_pandas_df.replace(555.555, 3) - - # pandas has narrower result types as they are determined dynamically - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) - - -def test_df_replace_regex_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace("^H.l", "Howdy, Planet!", regex=True).to_pandas() - pd_result = scalars_pandas_df.replace("^H.l", "Howdy, Planet!", regex=True) - - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) - - -def test_df_replace_list_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace([555.555, 3.2], 3).to_pandas() - pd_result = scalars_pandas_df.replace([555.555, 3.2], 3) - - # pandas has narrower result types as they are determined dynamically - pd.testing.assert_frame_equal( - pd_result, - bf_result, - check_dtype=False, - ) - - -def test_df_replace_value_dict(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas() - pd_result = scalars_pandas_df.replace(1, {"int64_col": 100, "int64_too": 200}) - - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) - - -def test_df_ffill(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas() - pd_result = scalars_pandas_df[["int64_col", "float64_col"]].ffill(limit=1) - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_bfill(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[["int64_col", "float64_col"]].bfill().to_pandas() - pd_result = scalars_pandas_df[["int64_col", "float64_col"]].bfill() - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_apply_series_series_callable( - scalars_df_index, - scalars_pandas_df_index, -): - columns = ["int64_too", "int64_col"] - - def foo(series, arg1, arg2, *, kwarg1=0, kwarg2=0): - return series**2 + (arg1 * arg2 % 4) + (kwarg1 * kwarg2 % 7) - - bf_result = ( - scalars_df_index[columns] - .apply(foo, args=(33, 61), kwarg1=52, kwarg2=21) - .to_pandas() - ) - - pd_result = scalars_pandas_df_index[columns].apply( - foo, args=(33, 61), kwarg1=52, kwarg2=21 - ) - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_apply_series_listlike_callable( - scalars_df_index, - scalars_pandas_df_index, -): - columns = ["int64_too", "int64_col"] - bf_result = ( - scalars_df_index[columns].apply(lambda x: [len(x), x.min(), 24]).to_pandas() - ) - - pd_result = scalars_pandas_df_index[columns].apply(lambda x: [len(x), x.min(), 24]) - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result.index = pd_result.index.astype("Int64") - pd_result = pd_result.astype("Int64") - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_apply_series_scalar_callable( - scalars_df_index, - scalars_pandas_df_index, -): - columns = ["int64_too", "int64_col"] - bf_result = scalars_df_index[columns].apply(lambda x: x.sum()) - - pd_result = scalars_pandas_df_index[columns].apply(lambda x: x.sum()) - - pandas.testing.assert_series_equal(bf_result, pd_result) - - -def test_df_pipe( - scalars_df_index, - scalars_pandas_df_index, -): - columns = ["int64_too", "int64_col"] - - def foo(x: int, y: int, df): - return (df + x) % y - - bf_result = ( - scalars_df_index[columns] - .pipe((foo, "df"), x=7, y=9) - .pipe(lambda x: x**2) - .to_pandas() - ) - - pd_result = ( - scalars_pandas_df_index[columns] - .pipe((foo, "df"), x=7, y=9) - .pipe(lambda x: x**2) - ) - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_keys( - scalars_df_index, - scalars_pandas_df_index, -): - pandas.testing.assert_index_equal( - scalars_df_index.keys(), scalars_pandas_df_index.keys() - ) - - -def test_df_iter( - scalars_df_index, - scalars_pandas_df_index, -): - for bf_i, df_i in zip(scalars_df_index, scalars_pandas_df_index): - assert bf_i == df_i - - -def test_iterrows( - scalars_df_index, - scalars_pandas_df_index, -): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df_index = scalars_df_index.add_suffix("_suffix", axis=1) - scalars_pandas_df_index = scalars_pandas_df_index.add_suffix("_suffix", axis=1) - for (bf_index, bf_series), (pd_index, pd_series) in zip( - scalars_df_index.iterrows(), scalars_pandas_df_index.iterrows() - ): - assert bf_index == pd_index - pandas.testing.assert_series_equal(bf_series, pd_series) - - -@pytest.mark.parametrize( - ( - "index", - "name", - ), - [ - ( - True, - "my_df", - ), - (False, None), - ], -) -def test_itertuples(scalars_df_index, index, name): - # Numeric has slightly different representation as a result of conversions. - bf_tuples = scalars_df_index.itertuples(index, name) - pd_tuples = scalars_df_index.to_pandas().itertuples(index, name) - for bf_tuple, pd_tuple in zip(bf_tuples, pd_tuples): - assert bf_tuple == pd_tuple - - -def test_df_isin_list_w_null(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - values = ["Hello, World!", 55555, 2.51, pd.NA, True] - bf_result = ( - scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] - .isin(values) - .to_pandas() - ) - pd_result = scalars_pandas_df[ - ["int64_col", "float64_col", "string_col", "bool_col"] - ].isin(values) - - pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) - - -def test_df_isin_list_wo_null(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - values = ["Hello, World!", 55555, 2.51, True] - bf_result = ( - scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] - .isin(values) - .to_pandas() - ) - pd_result = scalars_pandas_df[ - ["int64_col", "float64_col", "string_col", "bool_col"] - ].isin(values) - - pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) - - -def test_df_isin_dict(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - values = { - "string_col": ["Hello, World!", 55555, 2.51, pd.NA, True], - "int64_col": [5555, 2.51], - "bool_col": [pd.NA], - } - bf_result = ( - scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] - .isin(values) - .to_pandas() - ) - pd_result = scalars_pandas_df[ - ["int64_col", "float64_col", "string_col", "bool_col"] - ].isin(values) - - pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) - - -def test_df_cross_merge(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - left_columns = ["int64_col", "float64_col", "rowindex_2"] - right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] - - left = scalars_df[left_columns] - # Offset the rows somewhat so that outer join can have an effect. - right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) - - bf_result = left.merge(right, "cross").to_pandas() - - pd_result = scalars_pandas_df[left_columns].merge( - scalars_pandas_df[right_columns].assign( - rowindex_2=scalars_pandas_df["rowindex_2"] + 2 - ), - "cross", - ) - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -@pytest.mark.parametrize( - ("merge_how",), - [ - ("inner",), - ("outer",), - ("left",), - ("right",), - ], -) -def test_df_merge(scalars_dfs, merge_how): - scalars_df, scalars_pandas_df = scalars_dfs - on = "rowindex_2" - left_columns = ["int64_col", "float64_col", "rowindex_2"] - right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] - - left = scalars_df[left_columns] - # Offset the rows somewhat so that outer join can have an effect. - right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) - - df = left.merge(right, merge_how, on, sort=True) - bf_result = df.to_pandas() - - pd_result = scalars_pandas_df[left_columns].merge( - scalars_pandas_df[right_columns].assign( - rowindex_2=scalars_pandas_df["rowindex_2"] + 2 - ), - merge_how, - on, - sort=True, - ) - - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("left_on", "right_on"), - [ - (["int64_col", "rowindex_2"], ["int64_col", "rowindex_2"]), - (["rowindex_2", "int64_col"], ["int64_col", "rowindex_2"]), - (["rowindex_2", "float64_col"], ["int64_col", "rowindex_2"]), - ], -) -def test_df_merge_multi_key(scalars_dfs, left_on, right_on): - scalars_df, scalars_pandas_df = scalars_dfs - left_columns = ["int64_col", "float64_col", "rowindex_2"] - right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] - - left = scalars_df[left_columns] - # Offset the rows somewhat so that outer join can have an effect. - right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) - - df = left.merge(right, "outer", left_on=left_on, right_on=right_on, sort=True) - bf_result = df.to_pandas() - - pd_result = scalars_pandas_df[left_columns].merge( - scalars_pandas_df[right_columns].assign( - rowindex_2=scalars_pandas_df["rowindex_2"] + 2 - ), - "outer", - left_on=left_on, - right_on=right_on, - sort=True, - ) - - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("merge_how",), - [ - ("inner",), - ("outer",), - ("left",), - ("right",), - ], -) -def test_merge_custom_col_name(scalars_dfs, merge_how): - scalars_df, scalars_pandas_df = scalars_dfs - left_columns = ["int64_col", "float64_col"] - right_columns = ["int64_col", "bool_col", "string_col"] - on = "int64_col" - rename_columns = {"float64_col": "f64_col"} - - left = scalars_df[left_columns] - left = left.rename(columns=rename_columns) - right = scalars_df[right_columns] - df = left.merge(right, merge_how, on, sort=True) - bf_result = df.to_pandas() - - pandas_left_df = scalars_pandas_df[left_columns] - pandas_left_df = pandas_left_df.rename(columns=rename_columns) - pandas_right_df = scalars_pandas_df[right_columns] - pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) - - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("merge_how",), - [ - ("inner",), - ("outer",), - ("left",), - ("right",), - ], -) -def test_merge_left_on_right_on(scalars_dfs, merge_how): - scalars_df, scalars_pandas_df = scalars_dfs - left_columns = ["int64_col", "float64_col", "int64_too"] - right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] - - left = scalars_df[left_columns] - right = scalars_df[right_columns] - - df = left.merge( - right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True - ) - bf_result = df.to_pandas() - - pd_result = scalars_pandas_df[left_columns].merge( - scalars_pandas_df[right_columns], - merge_how, - left_on="int64_too", - right_on="rowindex_2", - sort=True, - ) - - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) - - -def test_self_merge_self_w_on_args(): - data = { - "A": pd.Series([1, 2, 3], dtype="Int64"), - "B": pd.Series([1, 2, 3], dtype="Int64"), - "C": pd.Series([100, 200, 300], dtype="Int64"), - "D": pd.Series(["alpha", "beta", "gamma"], dtype="string[pyarrow]"), - } - df = pd.DataFrame(data) - - df1 = df[["A", "C"]] - df2 = df[["B", "C", "D"]] - pd_result = df1.merge(df2, left_on=["A", "C"], right_on=["B", "C"], how="inner") - - bf_df = bpd.DataFrame(data) - - bf_df1 = bf_df[["A", "C"]] - bf_df2 = bf_df[["B", "C", "D"]] - bf_result = bf_df1.merge( - bf_df2, left_on=["A", "C"], right_on=["B", "C"], how="inner" - ).to_pandas() - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -@pytest.mark.parametrize( - ("decimals",), - [ - (2,), - ({"float64_col": 0, "bool_col": 1, "int64_too": -3},), - ({},), - ], -) -def test_dataframe_round(scalars_dfs, decimals): - if pd.__version__.startswith("1."): - pytest.skip("Rounding doesn't work as expected in pandas 1.x") - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.round(decimals).to_pandas() - pd_result = scalars_pandas_df.round(decimals) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_get_dtypes(scalars_df_default_index): - dtypes = scalars_df_default_index.dtypes - dtypes_dict: Dict[str, bigframes.dtypes.Dtype] = { - "bool_col": pd.BooleanDtype(), - "bytes_col": pd.ArrowDtype(pa.binary()), - "date_col": pd.ArrowDtype(pa.date32()), - "datetime_col": pd.ArrowDtype(pa.timestamp("us")), - "geography_col": gpd.array.GeometryDtype(), - "int64_col": pd.Int64Dtype(), - "int64_too": pd.Int64Dtype(), - "numeric_col": pd.ArrowDtype(pa.decimal128(38, 9)), - "float64_col": pd.Float64Dtype(), - "rowindex": pd.Int64Dtype(), - "rowindex_2": pd.Int64Dtype(), - "string_col": pd.StringDtype(storage="pyarrow"), - "time_col": pd.ArrowDtype(pa.time64("us")), - "timestamp_col": pd.ArrowDtype(pa.timestamp("us", tz="UTC")), - "duration_col": pd.ArrowDtype(pa.duration("us")), - } - pd.testing.assert_series_equal( - dtypes, - pd.Series(dtypes_dict), - ) - - -def test_get_dtypes_array_struct_query(session): - df = session.read_gbq( - """SELECT - [1, 3, 2] AS array_column, - STRUCT( - "a" AS string_field, - 1.2 AS float_field) AS struct_column""" - ) - - dtypes = df.dtypes - pd.testing.assert_series_equal( - dtypes, - pd.Series( - { - "array_column": pd.ArrowDtype(pa.list_(pa.int64())), - "struct_column": pd.ArrowDtype( - pa.struct( - [ - ("string_field", pa.string()), - ("float_field", pa.float64()), - ] - ) - ), - } - ), - ) - - -def test_get_dtypes_array_struct_table(nested_df): - dtypes = nested_df.dtypes - pd.testing.assert_series_equal( - dtypes, - pd.Series( - { - "customer_id": pd.StringDtype(storage="pyarrow"), - "day": pd.ArrowDtype(pa.date32()), - "flag": pd.Int64Dtype(), - "label": pd.ArrowDtype( - pa.struct( - [ - ("key", pa.string()), - ("value", pa.string()), - ] - ), - ), - "event_sequence": pd.ArrowDtype( - pa.list_( - pa.struct( - [ - pa.field( - "data", - pa.list_( - pa.struct( - [ - ("value", pa.float64()), - ("key", pa.string()), - ], - ), - ), - nullable=False, - ), - ("timestamp", pa.timestamp("us", "UTC")), - ("category", pa.string()), - ] - ), - ), - ), - "address": pd.ArrowDtype( - pa.struct( - [ - ("street", pa.string()), - ("city", pa.string()), - ] - ), - ), - } - ), - ) - - -def test_shape(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.shape - pd_result = scalars_pandas_df.shape - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - "reference_table, test_table", - [ - ( - "bigframes-dev.bigframes_tests_sys.base_table", - "bigframes-dev.bigframes_tests_sys.base_table_mat_view", - ), - ( - "bigframes-dev.bigframes_tests_sys.base_table", - "bigframes-dev.bigframes_tests_sys.base_table_view", - ), - ( - "bigframes-dev.bigframes_tests_sys.csv_native_table", - "bigframes-dev.bigframes_tests_sys.csv_external_table", - ), - ], -) -def test_view_and_external_table_shape(session, reference_table, test_table): - reference_df = session.read_gbq(reference_table) - test_df = session.read_gbq(test_table) - - assert test_df.shape == reference_df.shape - - -def test_len(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = len(scalars_df) - pd_result = len(scalars_pandas_df) - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("n_rows",), - [ - (50,), - (10000,), - ], -) -@pytest.mark.parametrize( - "write_engine", - ["bigquery_load", "bigquery_streaming", "bigquery_write"], -) -def test_df_len_local(session, n_rows, write_engine): - assert ( - len( - session.read_pandas( - pd.DataFrame(np.random.randint(1, 7, n_rows), columns=["one"]), - write_engine=write_engine, - ) - ) - == n_rows - ) - - -def test_size(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.size - pd_result = scalars_pandas_df.size - - assert bf_result == pd_result - - -def test_ndim(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.ndim - pd_result = scalars_pandas_df.ndim - - assert bf_result == pd_result - - -def test_empty_false(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.empty - pd_result = scalars_pandas_df.empty - - assert bf_result == pd_result - - -def test_empty_true_column_filter(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df[[]].empty - pd_result = scalars_pandas_df[[]].empty - - assert bf_result == pd_result - - -def test_empty_true_row_filter(scalars_dfs: Tuple[dataframe.DataFrame, pd.DataFrame]): - scalars_df, scalars_pandas_df = scalars_dfs - bf_bool: series.Series = typing.cast(series.Series, scalars_df["bool_col"]) - pd_bool: pd.Series = scalars_pandas_df["bool_col"] - bf_false = bf_bool.notna() & (bf_bool != bf_bool) - pd_false = pd_bool.notna() & (pd_bool != pd_bool) - - bf_result = scalars_df[bf_false].empty - pd_result = scalars_pandas_df[pd_false].empty - - assert pd_result - assert bf_result == pd_result - - -def test_empty_true_memtable(session: bigframes.Session): - bf_df = dataframe.DataFrame(session=session) - pd_df = pd.DataFrame() - - bf_result = bf_df.empty - pd_result = pd_df.empty - - assert pd_result - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("drop",), - ((True,), (False,)), -) -def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop): - df = scalars_df_index.reset_index(drop=drop) - assert df.index.name is None - - bf_result = df.to_pandas() - pd_result = scalars_pandas_df_index.reset_index(drop=drop) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.copy() - scalars_df_index.index.name = "int64_col" - df = scalars_df_index.reset_index(allow_duplicates=True, drop=False) - assert df.index.name is None - - bf_result = df.to_pandas() - - scalars_pandas_df_index = scalars_pandas_df_index.copy() - scalars_pandas_df_index.index.name = "int64_col" - pd_result = scalars_pandas_df_index.reset_index(allow_duplicates=True, drop=False) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_reset_index_duplicates_error(scalars_df_index): - scalars_df_index = scalars_df_index.copy() - scalars_df_index.index.name = "int64_col" - with pytest.raises(ValueError): - scalars_df_index.reset_index(allow_duplicates=False, drop=False) - - -@pytest.mark.parametrize( - ("drop",), - ((True,), (False,)), -) -def test_reset_index_inplace(scalars_df_index, scalars_pandas_df_index, drop): - df = scalars_df_index.copy() - df.reset_index(drop=drop, inplace=True) - assert df.index.name is None - - bf_result = df.to_pandas() - pd_result = scalars_pandas_df_index.copy() - pd_result.reset_index(drop=drop, inplace=True) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_reset_index_then_filter( - scalars_df_index, - scalars_pandas_df_index, -): - bf_filter = scalars_df_index["bool_col"].fillna(True) - bf_df = scalars_df_index.reset_index()[bf_filter] - bf_result = bf_df.to_pandas() - pd_filter = scalars_pandas_df_index["bool_col"].fillna(True) - pd_result = scalars_pandas_df_index.reset_index()[pd_filter] - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering and index keys - # post-filter will have gaps. - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_reset_index_with_unnamed_index( - scalars_df_index, - scalars_pandas_df_index, -): - scalars_df_index = scalars_df_index.copy() - scalars_pandas_df_index = scalars_pandas_df_index.copy() - - scalars_df_index.index.name = None - scalars_pandas_df_index.index.name = None - df = scalars_df_index.reset_index(drop=False) - assert df.index.name is None - - # reset_index(drop=False) creates a new column "index". - assert df.columns[0] == "index" - - bf_result = df.to_pandas() - pd_result = scalars_pandas_df_index.reset_index(drop=False) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_reset_index_with_unnamed_multiindex( - scalars_df_index, - scalars_pandas_df_index, -): - bf_df = dataframe.DataFrame( - ([1, 2, 3], [2, 5, 7]), - index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), - ) - pd_df = pd.DataFrame( - ([1, 2, 3], [2, 5, 7]), - index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), - ) - - bf_df = bf_df.reset_index() - pd_df = pd_df.reset_index() - - assert pd_df.columns[0] == "level_0" - assert bf_df.columns[0] == "level_0" - assert pd_df.columns[1] == "level_1" - assert bf_df.columns[1] == "level_1" - - -def test_reset_index_with_unnamed_index_and_index_column( - scalars_df_index, - scalars_pandas_df_index, -): - scalars_df_index = scalars_df_index.copy() - scalars_pandas_df_index = scalars_pandas_df_index.copy() - - scalars_df_index.index.name = None - scalars_pandas_df_index.index.name = None - df = scalars_df_index.assign(index=scalars_df_index["int64_col"]).reset_index( - drop=False - ) - assert df.index.name is None - - # reset_index(drop=False) creates a new column "level_0" if the "index" column already exists. - assert df.columns[0] == "level_0" - - bf_result = df.to_pandas() - pd_result = scalars_pandas_df_index.assign( - index=scalars_pandas_df_index["int64_col"] - ).reset_index(drop=False) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("drop",), - ( - (True,), - (False,), - ), -) -@pytest.mark.parametrize( - ("append",), - ( - (True,), - (False,), - ), -) -@pytest.mark.parametrize( - ("index_column",), - (("int64_too",), ("string_col",), ("timestamp_col",)), -) -def test_set_index(scalars_dfs, index_column, drop, append): - scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.set_index(index_column, append=append, drop=drop) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.set_index(index_column, append=append, drop=drop) - - # Sort to disambiguate when there are duplicate index labels. - # Note: Doesn't use assert_pandas_df_equal_ignore_ordering because we get - # "ValueError: 'timestamp_col' is both an index level and a column label, - # which is ambiguous" when trying to sort by a column with the same name as - # the index. - bf_result = bf_result.sort_values("rowindex_2") - pd_result = pd_result.sort_values("rowindex_2") - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_set_index_key_error(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - with pytest.raises(KeyError): - scalars_pandas_df.set_index(["not_a_col"]) - with pytest.raises(KeyError): - scalars_df.set_index(["not_a_col"]) - - -@pytest.mark.parametrize( - ("ascending",), - ((True,), (False,)), -) -@pytest.mark.parametrize( - ("na_position",), - (("first",), ("last",)), -) -@pytest.mark.parametrize( - ("axis",), - ((0,), ("columns",)), -) -def test_sort_index(scalars_dfs, ascending, na_position, axis): - index_column = "int64_col" - scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.set_index(index_column) - bf_result = df.sort_index( - ascending=ascending, na_position=na_position, axis=axis - ).to_pandas() - pd_result = scalars_pandas_df.set_index(index_column).sort_index( - ascending=ascending, na_position=na_position, axis=axis - ) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_dataframe_sort_index_inplace(scalars_dfs): - index_column = "int64_col" - scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.copy().set_index(index_column) - df.sort_index(ascending=False, inplace=True) - bf_result = df.to_pandas() - - pd_result = scalars_pandas_df.set_index(index_column).sort_index(ascending=False) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_abs(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - columns = ["int64_col", "int64_too", "float64_col"] - - bf_result = scalars_df[columns].abs() - pd_result = scalars_pandas_df[columns].abs() - - assert_dfs_equivalent(pd_result, bf_result) - - -def test_df_pos(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (+scalars_df[["int64_col", "numeric_col"]]).to_pandas() - pd_result = +scalars_pandas_df[["int64_col", "numeric_col"]] - - assert_pandas_df_equal(pd_result, bf_result) - - -def test_df_neg(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (-scalars_df[["int64_col", "numeric_col"]]).to_pandas() - pd_result = -scalars_pandas_df[["int64_col", "numeric_col"]] - - assert_pandas_df_equal(pd_result, bf_result) - - -def test_df__abs__(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = ( - abs(scalars_df[["int64_col", "numeric_col", "float64_col"]]) - ).to_pandas() - pd_result = abs(scalars_pandas_df[["int64_col", "numeric_col", "float64_col"]]) - - assert_pandas_df_equal(pd_result, bf_result) - - -def test_df_invert(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - columns = ["int64_col", "bool_col"] - - bf_result = (~scalars_df[columns]).to_pandas() - pd_result = ~scalars_pandas_df[columns] - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_df_isnull(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - columns = ["int64_col", "int64_too", "string_col", "bool_col"] - bf_result = scalars_df[columns].isnull().to_pandas() - pd_result = scalars_pandas_df[columns].isnull() - - # One of dtype mismatches to be documented. Here, the `bf_result.dtype` is - # `BooleanDtype` but the `pd_result.dtype` is `bool`. - pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype()) - pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype()) - pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) - pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_df_notnull(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - columns = ["int64_col", "int64_too", "string_col", "bool_col"] - bf_result = scalars_df[columns].notnull().to_pandas() - pd_result = scalars_pandas_df[columns].notnull() - - # One of dtype mismatches to be documented. Here, the `bf_result.dtype` is - # `BooleanDtype` but the `pd_result.dtype` is `bool`. - pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype()) - pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype()) - pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) - pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("left_labels", "right_labels", "overwrite", "fill_value"), - [ - (["a", "b", "c"], ["c", "a", "b"], True, None), - (["a", "b", "c"], ["c", "a", "b"], False, None), - (["a", "b", "c"], ["a", "b", "c"], False, 2), - ], - ids=[ - "one_one_match_overwrite", - "one_one_match_no_overwrite", - "exact_match", - ], -) -def test_combine( - scalars_df_index, - scalars_df_2_index, - scalars_pandas_df_index, - left_labels, - right_labels, - overwrite, - fill_value, -): - if pd.__version__.startswith("1."): - pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") - columns = ["int64_too", "int64_col", "float64_col"] - - bf_df_a = scalars_df_index[columns] - bf_df_a.columns = left_labels - bf_df_b = scalars_df_2_index[columns] - bf_df_b.columns = right_labels - bf_result = bf_df_a.combine( - bf_df_b, - lambda x, y: x**2 + 2 * x * y + y**2, - overwrite=overwrite, - fill_value=fill_value, - ).to_pandas() - - pd_df_a = scalars_pandas_df_index[columns] - pd_df_a.columns = left_labels - pd_df_b = scalars_pandas_df_index[columns] - pd_df_b.columns = right_labels - pd_result = pd_df_a.combine( - pd_df_b, - lambda x, y: x**2 + 2 * x * y + y**2, - overwrite=overwrite, - fill_value=fill_value, - ) - - # Some dtype inconsistency for all-NULL columns - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.parametrize( - ("overwrite", "filter_func"), - [ - (True, None), - (False, None), - (True, lambda x: x.isna() | (x % 2 == 0)), - ], - ids=[ - "default", - "overwritefalse", - "customfilter", - ], -) -def test_df_update(overwrite, filter_func): - if pd.__version__.startswith("1."): - pytest.skip("dtype handled differently in pandas 1.x.") - - index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") - - index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") - pd_df1 = pandas.DataFrame( - {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 - ) - pd_df2 = pandas.DataFrame( - {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, - dtype="Int64", - index=index2, - ) - - bf_df1 = dataframe.DataFrame(pd_df1) - bf_df2 = dataframe.DataFrame(pd_df2) - - bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func) - pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func) - - pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1) - - -def test_df_idxmin(): - pd_df = pd.DataFrame( - {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] - ) - bf_df = dataframe.DataFrame(pd_df) - - bf_result = bf_df.idxmin().to_pandas() - pd_result = pd_df.idxmin() - - pd.testing.assert_series_equal( - bf_result, pd_result, check_index_type=False, check_dtype=False - ) - - -def test_df_idxmax(): - pd_df = pd.DataFrame( - {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] - ) - bf_df = dataframe.DataFrame(pd_df) - - bf_result = bf_df.idxmax().to_pandas() - pd_result = pd_df.idxmax() - - pd.testing.assert_series_equal( - bf_result, pd_result, check_index_type=False, check_dtype=False - ) - - -@pytest.mark.parametrize( - ("join", "axis"), - [ - ("outer", None), - ("outer", 0), - ("outer", 1), - ("left", 0), - ("right", 1), - ("inner", None), - ("inner", 1), - ], -) -def test_df_align(join, axis): - - index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") - - index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") - pd_df1 = pandas.DataFrame( - {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 - ) - pd_df2 = pandas.DataFrame( - {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, - dtype="Int64", - index=index2, - ) - - bf_df1 = dataframe.DataFrame(pd_df1) - bf_df2 = dataframe.DataFrame(pd_df2) - - bf_result1, bf_result2 = bf_df1.align(bf_df2, join=join, axis=axis) - pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis) - - # Don't check dtype as pandas does unnecessary float conversion - assert isinstance(bf_result1, dataframe.DataFrame) and isinstance( - bf_result2, dataframe.DataFrame - ) - pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) - pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) - - -def test_combine_first( - scalars_df_index, - scalars_df_2_index, - scalars_pandas_df_index, -): - if pd.__version__.startswith("1."): - pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") - columns = ["int64_too", "int64_col", "float64_col"] - - bf_df_a = scalars_df_index[columns].iloc[0:6] - bf_df_a.columns = ["a", "b", "c"] - bf_df_b = scalars_df_2_index[columns].iloc[2:8] - bf_df_b.columns = ["b", "a", "d"] - bf_result = bf_df_a.combine_first(bf_df_b).to_pandas() - - pd_df_a = scalars_pandas_df_index[columns].iloc[0:6] - pd_df_a.columns = ["a", "b", "c"] - pd_df_b = scalars_pandas_df_index[columns].iloc[2:8] - pd_df_b.columns = ["b", "a", "d"] - pd_result = pd_df_a.combine_first(pd_df_b) - - # Some dtype inconsistency for all-NULL columns - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.parametrize( - ("columns", "numeric_only"), - [ - (["bool_col", "int64_col", "float64_col"], True), - (["bool_col", "int64_col", "float64_col"], False), - (["bool_col", "int64_col", "float64_col", "string_col"], True), - pytest.param( - ["bool_col", "int64_col", "float64_col", "string_col"], - False, - marks=pytest.mark.xfail( - raises=NotImplementedError, - ), - ), - ], -) -def test_df_corr_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - bf_result = scalars_df[columns].corr(numeric_only=numeric_only).to_pandas() - pd_result = scalars_pandas_df[columns].corr(numeric_only=numeric_only) - - # BigFrames and Pandas differ in their data type handling: - # - Column types: BigFrames uses Float64, Pandas uses float64. - # - Index types: BigFrames uses strign, Pandas uses object. - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) - # Only check row order in ordered mode. - pd.testing.assert_frame_equal( - bf_result, - pd_result, - check_dtype=False, - check_index_type=False, - check_like=~scalars_df._block.session._strictly_ordered, - ) - - -def test_df_corr_w_invalid_parameters(scalars_dfs): - columns = ["int64_too", "int64_col", "float64_col"] - scalars_df, _ = scalars_dfs - - with pytest.raises(NotImplementedError): - scalars_df[columns].corr(method="kendall") - - with pytest.raises(NotImplementedError): - scalars_df[columns].corr(min_periods=1) - - -@pytest.mark.parametrize( - ("columns", "numeric_only"), - [ - (["bool_col", "int64_col", "float64_col"], True), - (["bool_col", "int64_col", "float64_col"], False), - (["bool_col", "int64_col", "float64_col", "string_col"], True), - pytest.param( - ["bool_col", "int64_col", "float64_col", "string_col"], - False, - marks=pytest.mark.xfail( - raises=NotImplementedError, - ), - ), - ], -) -def test_cov_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas() - pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only) - # BigFrames and Pandas differ in their data type handling: - # - Column types: BigFrames uses Float64, Pandas uses float64. - # - Index types: BigFrames uses strign, Pandas uses object. - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) - # Only check row order in ordered mode. - pd.testing.assert_frame_equal( - bf_result, - pd_result, - check_dtype=False, - check_index_type=False, - check_like=~scalars_df._block.session._strictly_ordered, - ) - - -def test_df_corrwith_df(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - l_cols = ["int64_col", "float64_col", "int64_too"] - r_cols = ["int64_too", "float64_col"] - - bf_result = scalars_df[l_cols].corrwith(scalars_df[r_cols]).to_pandas() - pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_cols]) - - # BigFrames and Pandas differ in their data type handling: - # - Column types: BigFrames uses Float64, Pandas uses float64. - # - Index types: BigFrames uses strign, Pandas uses object. - pd.testing.assert_series_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def test_df_corrwith_df_numeric_only(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - l_cols = ["int64_col", "float64_col", "int64_too", "string_col"] - r_cols = ["int64_too", "float64_col", "bool_col"] - - bf_result = ( - scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=True).to_pandas() - ) - pd_result = scalars_pandas_df[l_cols].corrwith( - scalars_pandas_df[r_cols], numeric_only=True - ) - - # BigFrames and Pandas differ in their data type handling: - # - Column types: BigFrames uses Float64, Pandas uses float64. - # - Index types: BigFrames uses strign, Pandas uses object. - pd.testing.assert_series_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def test_df_corrwith_df_non_numeric_error(scalars_dfs): - scalars_df, _ = scalars_dfs - - l_cols = ["int64_col", "float64_col", "int64_too", "string_col"] - r_cols = ["int64_too", "float64_col", "bool_col"] - - with pytest.raises(NotImplementedError): - scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=False) - - -def test_df_corrwith_series(scalars_dfs_maybe_ordered): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - l_cols = ["int64_col", "float64_col", "int64_too"] - r_col = "float64_col" - - bf_result = scalars_df[l_cols].corrwith(scalars_df[r_col]).to_pandas() - pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_col]) - - # BigFrames and Pandas differ in their data type handling: - # - Column types: BigFrames uses Float64, Pandas uses float64. - # - Index types: BigFrames uses strign, Pandas uses object. - pd.testing.assert_series_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("op"), - [ - operator.add, - operator.sub, - operator.mul, - operator.truediv, - operator.floordiv, - operator.eq, - operator.ne, - operator.gt, - operator.ge, - operator.lt, - operator.le, - ], - ids=[ - "add", - "subtract", - "multiply", - "true_divide", - "floor_divide", - "eq", - "ne", - "gt", - "ge", - "lt", - "le", - ], -) -# TODO(garrettwu): deal with NA values -@pytest.mark.parametrize(("other_scalar"), [1, 2.5, 0, 0.0]) -@pytest.mark.parametrize(("reverse_operands"), [True, False]) -def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands): - scalars_df, scalars_pandas_df = scalars_dfs - columns = ["int64_col", "float64_col"] - - maybe_reversed_op = (lambda x, y: op(y, x)) if reverse_operands else op - - bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas() - pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_dataframe_string_radd_const(scalars_dfs): - pytest.importorskip( - "pandas", - minversion="2.0.0", - reason="PyArrow string addition requires pandas 2.0+", - ) - - scalars_df, scalars_pandas_df = scalars_dfs - columns = ["string_col", "string_col"] - - bf_result = ("prefix" + scalars_df[columns]).to_pandas() - pd_result = "prefix" + scalars_pandas_df[columns] - - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.parametrize(("other_scalar"), [1, -2]) -def test_mod(scalars_dfs, other_scalar): - # Zero case excluded as pandas produces 0 result for Int64 inputs rather than NA/NaN. - # This is likely a pandas bug as mod 0 is undefined in other dtypes, and most programming languages. - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas() - pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_scalar_binop_str_exception(scalars_dfs): - scalars_df, _ = scalars_dfs - columns = ["string_col"] - with pytest.raises(TypeError, match="Cannot add dtypes"): - (scalars_df[columns] + 1).to_pandas() - - -@pytest.mark.parametrize( - ("op"), - [ - (lambda x, y: x.add(y, axis="index")), - (lambda x, y: x.radd(y, axis="index")), - (lambda x, y: x.sub(y, axis="index")), - (lambda x, y: x.rsub(y, axis="index")), - (lambda x, y: x.mul(y, axis="index")), - (lambda x, y: x.rmul(y, axis="index")), - (lambda x, y: x.truediv(y, axis="index")), - (lambda x, y: x.rtruediv(y, axis="index")), - (lambda x, y: x.floordiv(y, axis="index")), - (lambda x, y: x.floordiv(y, axis="index")), - (lambda x, y: x.gt(y, axis="index")), - (lambda x, y: x.ge(y, axis="index")), - (lambda x, y: x.lt(y, axis="index")), - (lambda x, y: x.le(y, axis="index")), - ], - ids=[ - "add", - "radd", - "sub", - "rsub", - "mul", - "rmul", - "truediv", - "rtruediv", - "floordiv", - "rfloordiv", - "gt", - "ge", - "lt", - "le", - ], -) -def test_series_binop_axis_index( - scalars_dfs, - op, -): - scalars_df, scalars_pandas_df = scalars_dfs - df_columns = ["int64_col", "float64_col"] - series_column = "int64_too" - - bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas() - pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column]) - - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("input"), - [ - ((1000, 2000, 3000)), - (pd.Index([1000, 2000, 3000])), - (pd.Series((1000, 2000), index=["int64_too", "float64_col"])), - ], - ids=[ - "tuple", - "pd_index", - "pd_series", - ], -) -def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - df_columns = ["int64_col", "float64_col", "int64_too"] - - bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas() - if hasattr(input, "to_pandas"): - input = input.to_pandas() - pd_result = scalars_pandas_df[df_columns].add(input, axis=1) - - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - -def test_df_reverse_binop_pandas(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - pd_series = pd.Series([100, 200, 300]) - - df_columns = ["int64_col", "float64_col", "int64_too"] - - bf_result = pd_series + scalars_df[df_columns].to_pandas() - pd_result = pd_series + scalars_pandas_df[df_columns] - - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - -def test_listlike_binop_axis_1_bf_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - df_columns = ["int64_col", "float64_col", "int64_too"] - - bf_result = ( - scalars_df[df_columns] - .add(bf_indexes.Index([1000, 2000, 3000]), axis=1) - .to_pandas() - ) - pd_result = scalars_pandas_df[df_columns].add(pd.Index([1000, 2000, 3000]), axis=1) - - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - -def test_binop_with_self_aggregate(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - df_columns = ["int64_col", "float64_col", "int64_too"] - - # Ensure that this takes the optimized single-query path by counting executions - execution_count_before = scalars_df._session._metrics.execution_count - bf_df = scalars_df[df_columns] - bf_result = (bf_df - bf_df.mean()).to_pandas() - execution_count_after = scalars_df._session._metrics.execution_count - - pd_df = scalars_pandas_df[df_columns] - pd_result = pd_df - pd_df.mean() - - executions = execution_count_after - execution_count_before - - assert executions == 1 - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - -def test_binop_with_self_aggregate_w_index_reset(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - df_columns = ["int64_col", "float64_col", "int64_too"] - - # Ensure that this takes the optimized single-query path by counting executions - execution_count_before = scalars_df._session._metrics.execution_count - bf_df = scalars_df[df_columns].reset_index(drop=True) - bf_result = (bf_df - bf_df.mean()).to_pandas() - execution_count_after = scalars_df._session._metrics.execution_count - - pd_df = scalars_pandas_df[df_columns].reset_index(drop=True) - pd_result = pd_df - pd_df.mean() - - executions = execution_count_after - execution_count_before - - assert executions == 1 - pd_result.index = pd_result.index.astype("Int64") - assert_pandas_df_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("left_labels", "right_labels"), - [ - (["a", "a", "b"], ["c", "c", "d"]), - (["a", "b", "c"], ["c", "a", "b"]), - (["a", "c", "c"], ["c", "a", "c"]), - (["a", "b", "c"], ["a", "b", "c"]), - ], - ids=[ - "no_overlap", - "one_one_match", - "multi_match", - "exact_match", - ], -) -def test_binop_df_df_binary_op( - scalars_df_index, - scalars_df_2_index, - scalars_pandas_df_index, - left_labels, - right_labels, -): - if pd.__version__.startswith("1."): - pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") - columns = ["int64_too", "int64_col", "float64_col"] - - bf_df_a = scalars_df_index[columns] - bf_df_a.columns = left_labels - bf_df_b = scalars_df_2_index[columns] - bf_df_b.columns = right_labels - bf_result = (bf_df_a - bf_df_b).to_pandas() - - pd_df_a = scalars_pandas_df_index[columns] - pd_df_a.columns = left_labels - pd_df_b = scalars_pandas_df_index[columns] - pd_df_b.columns = right_labels - pd_result = pd_df_a - pd_df_b - - # Some dtype inconsistency for all-NULL columns - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -# Differnt table will only work for explicit index, since default index orders are arbitrary. -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_series_binop_add_different_table( - scalars_df_index, scalars_pandas_df_index, scalars_df_2_index, ordered -): - df_columns = ["int64_col", "float64_col"] - series_column = "int64_too" - - bf_result = ( - scalars_df_index[df_columns] - .add(scalars_df_2_index[series_column], axis="index") - .to_pandas(ordered=ordered) - ) - pd_result = scalars_pandas_df_index[df_columns].add( - scalars_pandas_df_index[series_column], axis="index" - ) - - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) - - -# TODO(garrettwu): Test series binop with different index - -all_joins = pytest.mark.parametrize( - ("how",), - (("outer",), ("left",), ("right",), ("inner",), ("cross",)), -) - - -@all_joins -def test_join_same_table(scalars_dfs_maybe_ordered, how): - bf_df, pd_df = scalars_dfs_maybe_ordered - - bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] - bf_df_a = bf_df_a.sort_index() - - bf_df_b = bf_df.set_index("int64_too")[["float64_col"]] - bf_df_b = bf_df_b[bf_df_b.float64_col > 0] - bf_df_b = bf_df_b.sort_values("float64_col") - - bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() - - pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]].sort_index() - pd_df_a = pd_df_a.sort_index() - - pd_df_b = pd_df.set_index("int64_too")[["float64_col"]] - pd_df_b = pd_df_b[pd_df_b.float64_col > 0] - pd_df_b = pd_df_b.sort_values("float64_col") - - pd_result = pd_df_a.join(pd_df_b, how=how) - - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) - - -def test_join_incompatible_key_type_error(scalars_dfs): - bf_df, _ = scalars_dfs - - bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] - bf_df_a = bf_df_a.sort_index() - - bf_df_b = bf_df.set_index("date_col")[["float64_col"]] - bf_df_b = bf_df_b[bf_df_b.float64_col > 0] - bf_df_b = bf_df_b.sort_values("float64_col") - - with pytest.raises(TypeError): - # joining incompatible date, int columns - bf_df_a.join(bf_df_b, how="left") - - -@all_joins -def test_join_different_table( - scalars_df_index, scalars_df_2_index, scalars_pandas_df_index, how -): - bf_df_a = scalars_df_index[["string_col", "int64_col"]] - bf_df_b = scalars_df_2_index.dropna()[["float64_col"]] - bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() - pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] - pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] - pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) - - -@all_joins -def test_join_different_table_with_duplicate_column_name( - scalars_df_index, scalars_pandas_df_index, how -): - bf_df_a = scalars_df_index[["string_col", "int64_col", "int64_too"]].rename( - columns={"int64_too": "int64_col"} - ) - bf_df_b = scalars_df_index.dropna()[ - ["string_col", "int64_col", "int64_too"] - ].rename(columns={"int64_too": "int64_col"}) - bf_result = bf_df_a.join(bf_df_b, how=how, lsuffix="_l", rsuffix="_r").to_pandas() - pd_df_a = scalars_pandas_df_index[["string_col", "int64_col", "int64_too"]].rename( - columns={"int64_too": "int64_col"} - ) - pd_df_b = scalars_pandas_df_index.dropna()[ - ["string_col", "int64_col", "int64_too"] - ].rename(columns={"int64_too": "int64_col"}) - pd_result = pd_df_a.join(pd_df_b, how=how, lsuffix="_l", rsuffix="_r") - - # Ensure no inplace changes - pd.testing.assert_index_equal(bf_df_a.columns, pd_df_a.columns) - pd.testing.assert_index_equal(bf_df_b.index.to_pandas(), pd_df_b.index) - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -@all_joins -def test_join_param_on_with_duplicate_column_name_not_on_col( - scalars_df_index, scalars_pandas_df_index, how -): - # This test is for duplicate column names, but the 'on' column is not duplicated. - if how == "cross": - return - bf_df_a = scalars_df_index[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - bf_df_b = scalars_df_index.dropna()[ - ["string_col", "datetime_col", "timestamp_col"] - ].rename(columns={"timestamp_col": "datetime_col"}) - bf_result = bf_df_a.join( - bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" - ).to_pandas() - pd_df_a = scalars_pandas_df_index[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - pd_df_b = scalars_pandas_df_index.dropna()[ - ["string_col", "datetime_col", "timestamp_col"] - ].rename(columns={"timestamp_col": "datetime_col"}) - pd_result = pd_df_a.join( - pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" - ) - pd.testing.assert_frame_equal( - bf_result.sort_index(), - pd_result.sort_index(), - check_like=True, - check_index_type=False, - check_names=False, - ) - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) - - -@pytest.mark.skipif( - pandas.__version__.startswith("1."), reason="bad left join in pandas 1.x" -) -@all_joins -def test_join_param_on_with_duplicate_column_name_on_col( - scalars_df_index, scalars_pandas_df_index, how -): - # This test is for duplicate column names, and the 'on' column is duplicated. - if how == "cross": - return - bf_df_a = scalars_df_index[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - bf_df_b = scalars_df_index.dropna()[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - bf_result = bf_df_a.join( - bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" - ).to_pandas() - pd_df_a = scalars_pandas_df_index[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - pd_df_b = scalars_pandas_df_index.dropna()[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - pd_result = pd_df_a.join( - pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" - ) - pd.testing.assert_frame_equal( - bf_result.sort_index(), - pd_result.sort_index(), - check_like=True, - check_index_type=False, - check_names=False, - ) - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) - - -@all_joins -def test_join_param_on(scalars_dfs, how): - bf_df, pd_df = scalars_dfs - - bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] - bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) - bf_df_b = bf_df[["float64_col"]] - - if how == "cross": - with pytest.raises(ValueError): - bf_df_a.join(bf_df_b, on="rowindex_2", how=how) - else: - bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() - - pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] - pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) - pd_df_b = pd_df[["float64_col"]] - pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) - - -@all_joins -def test_df_join_series(scalars_dfs, how): - bf_df, pd_df = scalars_dfs - - bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] - bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) - bf_series_b = bf_df["float64_col"] - - if how == "cross": - with pytest.raises(ValueError): - bf_df_a.join(bf_series_b, on="rowindex_2", how=how) - else: - bf_result = bf_df_a.join(bf_series_b, on="rowindex_2", how=how).to_pandas() - - pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] - pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) - pd_series_b = pd_df["float64_col"] - pd_result = pd_df_a.join(pd_series_b, on="rowindex_2", how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) - - -@pytest.mark.parametrize( - ("by", "ascending", "na_position"), - [ - ("int64_col", True, "first"), - (["bool_col", "int64_col"], True, "last"), - ("int64_col", False, "first"), - (["bool_col", "int64_col"], [False, True], "last"), - (["bool_col", "int64_col"], [True, False], "first"), - ], -) -def test_dataframe_sort_values( - scalars_df_index, scalars_pandas_df_index, by, ascending, na_position -): - # Test needs values to be unique - bf_result = scalars_df_index.sort_values( - by, ascending=ascending, na_position=na_position - ).to_pandas() - pd_result = scalars_pandas_df_index.sort_values( - by, ascending=ascending, na_position=na_position - ) - - pandas.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("by", "ascending", "na_position"), - [ - ("int64_col", True, "first"), - (["bool_col", "int64_col"], True, "last"), - ], -) -def test_dataframe_sort_values_inplace( - scalars_df_index, scalars_pandas_df_index, by, ascending, na_position -): - # Test needs values to be unique - bf_sorted = scalars_df_index.copy() - bf_sorted.sort_values( - by, ascending=ascending, na_position=na_position, inplace=True - ) - bf_result = bf_sorted.to_pandas() - pd_result = scalars_pandas_df_index.sort_values( - by, ascending=ascending, na_position=na_position - ) - - pandas.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_dataframe_sort_values_invalid_input(scalars_df_index): - with pytest.raises(KeyError): - scalars_df_index.sort_values(by=scalars_df_index["int64_col"]) - - -def test_dataframe_sort_values_stable(scalars_df_index, scalars_pandas_df_index): - bf_result = ( - scalars_df_index.sort_values("int64_col", kind="stable") - .sort_values("bool_col", kind="stable") - .to_pandas() - ) - pd_result = scalars_pandas_df_index.sort_values( - "int64_col", kind="stable" - ).sort_values("bool_col", kind="stable") - - pandas.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("operator", "columns"), - [ - pytest.param(lambda x: x.cumsum(), ["float64_col", "int64_too"]), - pytest.param(lambda x: x.cumprod(), ["float64_col", "int64_too"]), - pytest.param( - lambda x: x.cumprod(), - ["string_col"], - marks=pytest.mark.xfail( - raises=ValueError, - ), - ), - ], - ids=[ - "cumsum", - "cumprod", - "non-numeric", - ], -) -def test_dataframe_numeric_analytic_op( - scalars_df_index, scalars_pandas_df_index, operator, columns -): - # TODO: Add nullable ints (pandas 1.x has poor behavior on these) - bf_series = operator(scalars_df_index[columns]) - pd_series = operator(scalars_pandas_df_index[columns]) - bf_result = bf_series.to_pandas() - pd.testing.assert_frame_equal(pd_series, bf_result, check_dtype=False) - - -@pytest.mark.parametrize( - ("operator"), - [ - (lambda x: x.cummin()), - (lambda x: x.cummax()), - (lambda x: x.shift(2)), - (lambda x: x.shift(-2)), - ], - ids=[ - "cummin", - "cummax", - "shiftpostive", - "shiftnegative", - ], -) -def test_dataframe_general_analytic_op( - scalars_df_index, scalars_pandas_df_index, operator -): - col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] - bf_series = operator(scalars_df_index[col_names]) - pd_series = operator(scalars_pandas_df_index[col_names]) - bf_result = bf_series.to_pandas() - pd.testing.assert_frame_equal( - pd_series, - bf_result, - ) - - -@pytest.mark.parametrize( - ("periods",), - [ - (1,), - (2,), - (-1,), - ], -) -def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods): - col_names = ["int64_too", "float64_col", "int64_col"] - bf_result = scalars_df_index[col_names].diff(periods=periods).to_pandas() - pd_result = scalars_pandas_df_index[col_names].diff(periods=periods) - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) - - -@pytest.mark.parametrize( - ("periods",), - [ - (1,), - (2,), - (-1,), - ], -) -def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods): - col_names = ["int64_too", "float64_col", "int64_col"] - bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas() - pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods) - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) - - -def test_dataframe_agg_single_string(scalars_dfs): - numeric_cols = ["int64_col", "int64_too", "float64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df[numeric_cols].agg("sum").to_pandas() - pd_result = scalars_pandas_df[numeric_cols].agg("sum") - - assert bf_result.dtype == "Float64" - pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("agg",), - ( - ("sum",), - ("size",), - ), -) -def test_dataframe_agg_int_single_string(scalars_dfs, agg): - numeric_cols = ["int64_col", "int64_too", "bool_col"] - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df[numeric_cols].agg(agg).to_pandas() - pd_result = scalars_pandas_df[numeric_cols].agg(agg) - - assert bf_result.dtype == "Int64" - pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) - - -def test_dataframe_agg_multi_string(scalars_dfs_maybe_ordered): - numeric_cols = ["int64_col", "int64_too", "float64_col"] - aggregations = [ - "sum", - "mean", - "median", - "std", - "var", - "min", - "max", - "nunique", - "count", - ] - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - bf_result = scalars_df[numeric_cols].agg(aggregations) - pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - # Drop median, as it's an approximation. - bf_median = bf_result.loc["median", :] - bf_result = bf_result.drop(labels=["median"]) - pd_result = pd_result.drop(labels=["median"]) - - assert_dfs_equivalent(pd_result, bf_result, check_index_type=False) - - # Double-check that median is at least plausible. - assert ( - (bf_result.loc["min", :] <= bf_median) & (bf_median <= bf_result.loc["max", :]) - ).all() - - -def test_dataframe_agg_int_multi_string(scalars_dfs): - numeric_cols = ["int64_col", "int64_too", "bool_col"] - aggregations = [ - "sum", - "nunique", - "count", - "size", - ] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas() - pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) - - for dtype in bf_result.dtypes: - assert dtype == "Int64" - - # Pandas may produce narrower numeric types - # Pandas has object index type - pd.testing.assert_frame_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) - - -def test_df_transpose(): - # Include some floats to ensure type coercion - values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] - # Test complex case of both axes being multi-indices with non-unique elements - - columns: pandas.Index = pd.Index( - ["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow") - ) - columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"]) - - index: pandas.Index = pd.Index( - ["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow") - ) - rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) - - pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) - bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) - - pd_result = pd_df.T - bf_result = bf_df.T.to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) - - -def test_df_transpose_error(): - with pytest.raises(TypeError, match="Cannot coerce.*to a common type."): - dataframe.DataFrame([[1, "hello"], [2, "world"]]).transpose() - - -def test_df_transpose_repeated_uses_cache(): - bf_df = dataframe.DataFrame([[1, 2.5], [2, 3.5]]) - pd_df = pandas.DataFrame([[1, 2.5], [2, 3.5]]) - # Transposing many times so that operation will fail from complexity if not using cache - for i in range(10): - # Cache still works even with simple scalar binop - bf_df = bf_df.transpose() + i - pd_df = pd_df.transpose() + i - - pd.testing.assert_frame_equal( - pd_df, bf_df.to_pandas(), check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_df_stack(scalars_dfs, ordered): - if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): - pytest.skip("pandas <2.1 uses different stack implementation") - scalars_df, scalars_pandas_df = scalars_dfs - # To match bigquery dataframes - scalars_pandas_df = scalars_pandas_df.copy() - scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") - # Can only stack identically-typed columns - columns = ["int64_col", "int64_too", "rowindex_2"] - - bf_result = scalars_df[columns].stack().to_pandas(ordered=ordered) - pd_result = scalars_pandas_df[columns].stack(future_stack=True) - - # Pandas produces NaN, where bq dataframes produces pd.NA - assert_series_equal( - bf_result, pd_result, check_dtype=False, ignore_order=not ordered - ) - - -def test_df_melt_default(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - # To match bigquery dataframes - scalars_pandas_df = scalars_pandas_df.copy() - scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") - # Can only stack identically-typed columns - columns = ["int64_col", "int64_too", "rowindex_2"] - - bf_result = scalars_df[columns].melt().to_pandas() - pd_result = scalars_pandas_df[columns].melt() - - # Pandas produces int64 index, Bigframes produces Int64 (nullable) - pd.testing.assert_frame_equal( - bf_result, - pd_result, - check_index_type=False, - check_dtype=False, - ) - - -def test_df_melt_parameterized(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - # To match bigquery dataframes - scalars_pandas_df = scalars_pandas_df.copy() - scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") - # Can only stack identically-typed columns - - bf_result = scalars_df.melt( - var_name="alice", - value_name="bob", - id_vars=["string_col"], - value_vars=["int64_col", "int64_too"], - ).to_pandas() - pd_result = scalars_pandas_df.melt( - var_name="alice", - value_name="bob", - id_vars=["string_col"], - value_vars=["int64_col", "int64_too"], - ) - - # Pandas produces int64 index, Bigframes produces Int64 (nullable) - pd.testing.assert_frame_equal( - bf_result, pd_result, check_index_type=False, check_dtype=False - ) - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_df_unstack(scalars_dfs, ordered): - scalars_df, scalars_pandas_df = scalars_dfs - # To match bigquery dataframes - scalars_pandas_df = scalars_pandas_df.copy() - scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") - # Can only stack identically-typed columns - columns = [ - "rowindex_2", - "int64_col", - "int64_too", - ] - - # unstack on mono-index produces series - bf_result = scalars_df[columns].unstack().to_pandas(ordered=ordered) - pd_result = scalars_pandas_df[columns].unstack() - - # Pandas produces NaN, where bq dataframes produces pd.NA - assert_series_equal( - bf_result, pd_result, check_dtype=False, ignore_order=not ordered - ) - - -@pytest.mark.parametrize( - ("values", "index", "columns"), - [ - ("int64_col", "int64_too", ["string_col"]), - (["int64_col"], "int64_too", ["string_col"]), - (["int64_col", "float64_col"], "int64_too", ["string_col"]), - ], -) -def test_df_pivot(scalars_dfs, values, index, columns): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.pivot( - values=values, index=index, columns=columns - ).to_pandas() - pd_result = scalars_pandas_df.pivot(values=values, index=index, columns=columns) - - # Pandas produces NaN, where bq dataframes produces pd.NA - bf_result = bf_result.fillna(float("nan")) - pd_result = pd_result.fillna(float("nan")) - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.parametrize( - ("values", "index", "columns"), - [ - (["goals", "assists"], ["team_name", "season"], ["position"]), - (["goals", "assists"], ["season"], ["team_name", "position"]), - ], -) -def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns): - bf_result = ( - hockey_df.reset_index() - .pivot(values=values, index=index, columns=columns) - .to_pandas() - ) - pd_result = hockey_pandas_df.reset_index().pivot( - values=values, index=index, columns=columns - ) - - # Pandas produces NaN, where bq dataframes produces pd.NA - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.parametrize( - ("values", "index", "columns", "aggfunc"), - [ - (("culmen_length_mm", "body_mass_g"), "species", "sex", "std"), - (["body_mass_g", "culmen_length_mm"], ("species", "island"), "sex", "sum"), - ("body_mass_g", "sex", ["island", "species"], "mean"), - ("culmen_depth_mm", "island", "species", "max"), - ], -) -def test_df_pivot_table( - penguins_df_default_index, - penguins_pandas_df_default_index, - values, - index, - columns, - aggfunc, -): - bf_result = penguins_df_default_index.pivot_table( - values=values, index=index, columns=columns, aggfunc=aggfunc - ).to_pandas() - pd_result = penguins_pandas_df_default_index.pivot_table( - values=values, index=index, columns=columns, aggfunc=aggfunc - ) - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_column_type=False - ) - - -def test_ipython_key_completions_with_drop(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_names = "string_col" - bf_dataframe = scalars_df.drop(columns=col_names) - pd_dataframe = scalars_pandas_df.drop(columns=col_names) - expected = pd_dataframe.columns.tolist() - - results = bf_dataframe._ipython_key_completions_() - - assert col_names not in results - assert results == expected - # _ipython_key_completions_ is called with square brackets - # so only column names are relevant with tab completion - assert "to_gbq" not in results - assert "merge" not in results - assert "drop" not in results - - -def test_ipython_key_completions_with_rename(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"string_col": "a_renamed_column"} - bf_dataframe = scalars_df.rename(columns=col_name_dict) - pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) - expected = pd_dataframe.columns.tolist() - - results = bf_dataframe._ipython_key_completions_() - - assert "string_col" not in results - assert "a_renamed_column" in results - assert results == expected - # _ipython_key_completions_ is called with square brackets - # so only column names are relevant with tab completion - assert "to_gbq" not in results - assert "merge" not in results - assert "drop" not in results - - -def test__dir__with_drop(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_names = "string_col" - bf_dataframe = scalars_df.drop(columns=col_names) - pd_dataframe = scalars_pandas_df.drop(columns=col_names) - expected = pd_dataframe.columns.tolist() - - results = dir(bf_dataframe) - - assert col_names not in results - assert frozenset(expected) <= frozenset(results) - # __dir__ is called with a '.' and displays all methods, columns names, etc. - assert "to_gbq" in results - assert "merge" in results - assert "drop" in results - - -def test__dir__with_rename(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"string_col": "a_renamed_column"} - bf_dataframe = scalars_df.rename(columns=col_name_dict) - pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) - expected = pd_dataframe.columns.tolist() - - results = dir(bf_dataframe) - - assert "string_col" not in results - assert "a_renamed_column" in results - assert frozenset(expected) <= frozenset(results) - # __dir__ is called with a '.' and displays all methods, columns names, etc. - assert "to_gbq" in results - assert "merge" in results - assert "drop" in results - - -def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index[["int64_col", "int64_col", "int64_too"]].to_pandas() - pd_result = scalars_pandas_df_index[["int64_col", "int64_col", "int64_too"]] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("start", "stop", "step"), - [ - (0, 0, None), - (None, None, None), - (1, None, None), - (None, 4, None), - (None, None, 2), - (None, 50000000000, 1), - (5, 4, None), - (3, None, 2), - (1, 7, 2), - (1, 7, 50000000000), - ], -) -def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): - bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() - pd_result = scalars_pandas_df_index.iloc[start:stop:step] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("start", "stop", "step"), - [ - (0, 0, None), - ], -) -def test_iloc_slice_after_cache( - scalars_df_index, scalars_pandas_df_index, start, stop, step -): - scalars_df_index.cache() - bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() - pd_result = scalars_pandas_df_index.iloc[start:stop:step] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_iloc_slice_zero_step(scalars_df_index): - with pytest.raises(ValueError): - scalars_df_index.iloc[0:0:0] - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): - bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) - pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] - - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) - - -@pytest.mark.parametrize( - "index", - [0, 5, -2, (2,)], -) -def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): - bf_result = scalars_df_index.iloc[index] - pd_result = scalars_pandas_df_index.iloc[index] - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - "index", - [(2, 5), (5, 0), (0, 0)], -) -def test_iloc_tuple(scalars_df_index, scalars_pandas_df_index, index): - bf_result = scalars_df_index.iloc[index] - pd_result = scalars_pandas_df_index.iloc[index] - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - "index", - [(slice(None), [1, 2, 3]), (slice(1, 7, 2), [2, 5, 3])], -) -def test_iloc_tuple_multi_columns(scalars_df_index, scalars_pandas_df_index, index): - bf_result = scalars_df_index.iloc[index].to_pandas() - pd_result = scalars_pandas_df_index.iloc[index] - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_iloc_tuple_multi_columns_single_row(scalars_df_index, scalars_pandas_df_index): - index = (2, [2, 1, 3, -4]) - bf_result = scalars_df_index.iloc[index] - pd_result = scalars_pandas_df_index.iloc[index] - pd.testing.assert_series_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("index", "error"), - [ - ((1, 1, 1), pd.errors.IndexingError), - (("asd", "asd", "asd"), pd.errors.IndexingError), - (("asd"), TypeError), - ], -) -def test_iloc_tuple_errors(scalars_df_index, scalars_pandas_df_index, index, error): - with pytest.raises(error): - scalars_df_index.iloc[index] - with pytest.raises(error): - scalars_pandas_df_index.iloc[index] - - -@pytest.mark.parametrize( - "index", - [(2, 5), (5, 0), (0, 0)], -) -def test_iat(scalars_df_index, scalars_pandas_df_index, index): - bf_result = scalars_df_index.iat[index] - pd_result = scalars_pandas_df_index.iat[index] - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("index", "error"), - [ - (0, TypeError), - ("asd", ValueError), - ((1, 2, 3), TypeError), - (("asd", "asd"), ValueError), - ], -) -def test_iat_errors(scalars_df_index, scalars_pandas_df_index, index, error): - with pytest.raises(error): - scalars_pandas_df_index.iat[index] - with pytest.raises(error): - scalars_df_index.iat[index] - - -def test_iloc_single_integer_out_of_bound_error(scalars_df_index): - with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): - scalars_df_index.iloc[99] - - -def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[scalars_df_index.bool_col].to_pandas() - pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index.bool_col] - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_loc_list_select_rows_and_columns(scalars_df_index, scalars_pandas_df_index): - idx_list = [0, 3, 5] - bf_result = scalars_df_index.loc[idx_list, ["bool_col", "int64_col"]].to_pandas() - pd_result = scalars_pandas_df_index.loc[idx_list, ["bool_col", "int64_col"]] - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[:, "int64_col"].to_pandas() - pd_result = scalars_pandas_df_index.loc[:, "int64_col"] - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[:, scalars_df_index.dtypes == "Int64"].to_pandas() - pd_result = scalars_pandas_df_index.loc[ - :, scalars_pandas_df_index.dtypes == "Int64" - ] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_loc_select_with_column_condition_bf_series( - scalars_df_index, scalars_pandas_df_index -): - # (b/347072677) GEOGRAPH type doesn't support DISTINCT op - columns = [ - item for item in scalars_pandas_df_index.columns if item != "geography_col" - ] - scalars_df_index = scalars_df_index[columns] - scalars_pandas_df_index = scalars_pandas_df_index[columns] - - size_half = len(scalars_pandas_df_index) / 2 - bf_result = scalars_df_index.loc[ - :, scalars_df_index.nunique() > size_half - ].to_pandas() - pd_result = scalars_pandas_df_index.loc[ - :, scalars_pandas_df_index.nunique() > size_half - ] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("string_col", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index( - "string_col", drop=False - ) - index = "Hello, World!" - bf_result = scalars_df_index.loc[index] - pd_result = scalars_pandas_df_index.loc[index] - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) - index = -2345 - bf_result = scalars_df_index.loc[index] - pd_result = scalars_pandas_df_index.loc[index] - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_at_with_duplicate(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("string_col", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index( - "string_col", drop=False - ) - index = "Hello, World!" - bf_result = scalars_df_index.at[index, "int64_too"] - pd_result = scalars_pandas_df_index.at[index, "int64_too"] - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_at_no_duplicate(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) - index = -2345 - bf_result = scalars_df_index.at[index, "string_col"] - pd_result = scalars_pandas_df_index.at[index, "string_col"] - assert bf_result == pd_result - - -def test_loc_setitem_bool_series_scalar_new_col(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.loc[bf_df["int64_too"] == 0, "new_col"] = 99 - pd_df.loc[pd_df["int64_too"] == 0, "new_col"] = 99 - - # pandas uses float64 instead - pd_df["new_col"] = pd_df["new_col"].astype("Float64") - - pd.testing.assert_frame_equal( - bf_df.to_pandas(), - pd_df, - ) - - -@pytest.mark.parametrize( - ("col", "value"), - [ - ("string_col", "hello"), - ("int64_col", 3), - ("float64_col", 3.5), - ], -) -def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs, col, value): - if pd.__version__.startswith("1."): - pytest.skip("this loc overload not supported in pandas 1.x.") - - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.loc[bf_df["int64_too"] == 1, col] = value - pd_df.loc[pd_df["int64_too"] == 1, col] = value - - pd.testing.assert_frame_equal( - bf_df.to_pandas(), - pd_df, - ) - - -def test_loc_setitem_bool_series_scalar_error(scalars_dfs): - if pd.__version__.startswith("1."): - pytest.skip("this loc overload not supported in pandas 1.x.") - - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - - with pytest.raises(Exception): - bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = 99 - with pytest.raises(Exception): - pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99 - - -@pytest.mark.parametrize( - ("col", "op"), - [ - # Int aggregates - pytest.param("int64_col", lambda x: x.sum(), id="int-sum"), - pytest.param("int64_col", lambda x: x.min(), id="int-min"), - pytest.param("int64_col", lambda x: x.max(), id="int-max"), - pytest.param("int64_col", lambda x: x.count(), id="int-count"), - pytest.param("int64_col", lambda x: x.nunique(), id="int-nunique"), - # Float aggregates - pytest.param("float64_col", lambda x: x.count(), id="float-count"), - pytest.param("float64_col", lambda x: x.nunique(), id="float-nunique"), - # Bool aggregates - pytest.param("bool_col", lambda x: x.sum(), id="bool-sum"), - pytest.param("bool_col", lambda x: x.count(), id="bool-count"), - pytest.param("bool_col", lambda x: x.nunique(), id="bool-nunique"), - # String aggregates - pytest.param("string_col", lambda x: x.count(), id="string-count"), - pytest.param("string_col", lambda x: x.nunique(), id="string-nunique"), - ], -) -def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, op): - bf_result = op(scalars_df_index[[col]]).to_pandas() - pd_result = op(scalars_pandas_df_index[[col]]) - - # Check dtype separately - assert bf_result.dtype == "Int64" - # Is otherwise "object" dtype - pd_result.index = pd_result.index.astype("string[pyarrow]") - # Pandas may produce narrower numeric types - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) - - -@pytest.mark.parametrize( - ("col", "op"), - [ - pytest.param("bool_col", lambda x: x.min(), id="bool-min"), - pytest.param("bool_col", lambda x: x.max(), id="bool-max"), - ], -) -def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col, op): - bf_result = op(scalars_df_index[[col]]).to_pandas() - pd_result = op(scalars_pandas_df_index[[col]]) - - # Check dtype separately - assert bf_result.dtype == "boolean" - - # Pandas may produce narrower numeric types - # Pandas has object index type - pd_result.index = pd_result.index.astype("string[pyarrow]") - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) - - -@pytest.mark.parametrize( - ("op", "bf_dtype"), - [ - (lambda x: x.sum(numeric_only=True), "Float64"), - (lambda x: x.mean(numeric_only=True), "Float64"), - (lambda x: x.min(numeric_only=True), "Float64"), - (lambda x: x.max(numeric_only=True), "Float64"), - (lambda x: x.std(numeric_only=True), "Float64"), - (lambda x: x.var(numeric_only=True), "Float64"), - (lambda x: x.count(numeric_only=False), "Int64"), - (lambda x: x.nunique(), "Int64"), - ], - ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], -) -def test_dataframe_aggregates(scalars_dfs_maybe_ordered, op, bf_dtype): - scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered - col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] - bf_series = op(scalars_df_index[col_names]) - bf_result = bf_series - pd_result = op(scalars_pandas_df_index[col_names]) - - # Check dtype separately - assert bf_result.dtype == bf_dtype - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - # Pandas has object index type - pd_result.index = pd_result.index.astype("string[pyarrow]") - assert_series_equivalent( - pd_result, - bf_result, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - ("op"), - [ - (lambda x: x.sum(axis=1, numeric_only=True)), - (lambda x: x.mean(axis=1, numeric_only=True)), - (lambda x: x.min(axis=1, numeric_only=True)), - (lambda x: x.max(axis=1, numeric_only=True)), - (lambda x: x.std(axis=1, numeric_only=True)), - (lambda x: x.var(axis=1, numeric_only=True)), - ], - ids=["sum", "mean", "min", "max", "std", "var"], -) -def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index, op): - col_names = ["int64_too", "int64_col", "float64_col", "bool_col", "string_col"] - bf_result = op(scalars_df_index[col_names]).to_pandas() - pd_result = op(scalars_pandas_df_index[col_names]) - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - # Pandas has object index type - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) - - -def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): - col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] - bf_result = scalars_df_index[col_names].median(numeric_only=True).to_pandas() - pd_result = scalars_pandas_df_index[col_names].agg(["min", "max"]) - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - # Median is an approximation, but double-check that median is plausible. - for col in col_names: - assert (pd_result.loc["min", col] <= bf_result[col]) and ( - bf_result[col] <= pd_result.loc["max", col] - ) - - -def test_dataframe_aggregates_quantile_mono(scalars_df_index, scalars_pandas_df_index): - q = 0.45 - col_names = ["int64_too", "int64_col", "float64_col"] - bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() - pd_result = scalars_pandas_df_index[col_names].quantile(q=q) - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) - - -def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df_index): - q = [0, 0.33, 0.67, 1.0] - col_names = ["int64_too", "int64_col", "float64_col"] - bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() - pd_result = scalars_pandas_df_index[col_names].quantile(q=q) - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - pd_result.index = pd_result.index.astype("Float64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("op"), - [ - (lambda x: x.all(bool_only=True)), - (lambda x: x.any(bool_only=True)), - (lambda x: x.all(axis=1, bool_only=True)), - (lambda x: x.any(axis=1, bool_only=True)), - ], - ids=["all_axis0", "any_axis0", "all_axis1", "any_axis1"], -) -def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op): - # Pandas will drop nullable 'boolean' dtype so we convert first to bool, then cast back later - scalars_df_index = scalars_df_index.assign( - bool_col=scalars_df_index.bool_col.fillna(False) - ) - scalars_pandas_df_index = scalars_pandas_df_index.assign( - bool_col=scalars_pandas_df_index.bool_col.fillna(False).astype("bool") - ) - bf_series = op(scalars_df_index) - pd_series = op(scalars_pandas_df_index).astype("boolean") - bf_result = bf_series.to_pandas() - - pd_series.index = pd_series.index.astype(bf_result.index.dtype) - pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) - - -def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): - col_names = ["int64_too", "float64_col"] - bf_series = scalars_df_index[col_names].prod() - pd_series = scalars_pandas_df_index[col_names].prod() - bf_result = bf_series.to_pandas() - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_series = pd_series.astype("Float64") - # Pandas has object index type - pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) - - -def test_df_skew_too_few_values(scalars_dfs): - columns = ["float64_col", "int64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].head(2).skew().to_pandas() - pd_result = scalars_pandas_df[columns].head(2).skew() - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_df_skew(scalars_dfs, ordered): - columns = ["float64_col", "int64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].skew().to_pandas(ordered=ordered) - pd_result = scalars_pandas_df[columns].skew() - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - assert_series_equal( - pd_result, bf_result, check_index_type=False, ignore_order=not ordered - ) - - -def test_df_kurt_too_few_values(scalars_dfs): - columns = ["float64_col", "int64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].head(2).kurt().to_pandas() - pd_result = scalars_pandas_df[columns].head(2).kurt() - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) - - -def test_df_kurt(scalars_dfs): - columns = ["float64_col", "int64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].kurt().to_pandas() - pd_result = scalars_pandas_df[columns].kurt() - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) - - -@pytest.mark.parametrize( - ("frac", "n", "random_state"), - [ - (None, 4, None), - (0.5, None, None), - (None, 4, 10), - (0.5, None, 10), - (None, None, None), - ], - ids=[ - "n_wo_random_state", - "frac_wo_random_state", - "n_w_random_state", - "frac_w_random_state", - "n_default", - ], -) -def test_sample(scalars_dfs, frac, n, random_state): - scalars_df, _ = scalars_dfs - df = scalars_df.sample(frac=frac, n=n, random_state=random_state) - bf_result = df.to_pandas() - - n = 1 if n is None else n - expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n - assert bf_result.shape[0] == expected_sample_size - assert bf_result.shape[1] == scalars_df.shape[1] - - -def test_sample_determinism(penguins_df_default_index): - df = penguins_df_default_index.sample(n=100, random_state=12345).head(15) - bf_result = df.to_pandas() - bf_result2 = df.to_pandas() - - pandas.testing.assert_frame_equal(bf_result, bf_result2) - - -def test_sample_raises_value_error(scalars_dfs): - scalars_df, _ = scalars_dfs - with pytest.raises( - ValueError, match="Only one of 'n' or 'frac' parameter can be specified." - ): - scalars_df.sample(frac=0.5, n=4) - - -def test_sample_args_sort(scalars_dfs): - scalars_df, _ = scalars_dfs - index = [4, 3, 2, 5, 1, 0] - scalars_df = scalars_df.iloc[index] - - kwargs = {"frac": 1.0, "random_state": 333} - - df = scalars_df.sample(**kwargs).to_pandas() - assert df.index.values != index - assert df.index.values != sorted(index) - - df = scalars_df.sample(sort="random", **kwargs).to_pandas() - assert df.index.values != index - assert df.index.values != sorted(index) - - df = scalars_df.sample(sort=True, **kwargs).to_pandas() - assert df.index.values == sorted(index) - - df = scalars_df.sample(sort=False, **kwargs).to_pandas() - assert df.index.values == index - - -@pytest.mark.parametrize( - ("axis",), - [ - (None,), - (0,), - (1,), - ], -) -def test_df_add_prefix(scalars_df_index, scalars_pandas_df_index, axis): - if pd.__version__.startswith("1."): - pytest.skip("add_prefix axis parameter not supported in pandas 1.x.") - bf_result = scalars_df_index.add_prefix("prefix_", axis).to_pandas() - - pd_result = scalars_pandas_df_index.add_prefix("prefix_", axis) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - ("axis",), - [ - (0,), - (1,), - ], -) -def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis): - if pd.__version__.startswith("1."): - pytest.skip("add_prefix axis parameter not supported in pandas 1.x.") - bf_result = scalars_df_index.add_suffix("_suffix", axis).to_pandas() - - pd_result = scalars_pandas_df_index.add_suffix("_suffix", axis) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - check_index_type=False, - ) - - -def test_df_astype_error_error(session): - input = pd.DataFrame(["hello", "world", "3.11", "4000"]) - with pytest.raises(ValueError): - session.read_pandas(input).astype("Float64", errors="bad_value") - - -def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index): - if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): - pytest.skip("pandas filter items behavior different pre-2.1") - bf_result = scalars_df_index.filter(items=["string_col", "int64_col"]).to_pandas() - - pd_result = scalars_pandas_df_index.filter(items=["string_col", "int64_col"]) - # Ignore column ordering as pandas order differently depending on version - pd.testing.assert_frame_equal( - bf_result.sort_index(axis=1), - pd_result.sort_index(axis=1), - ) - - -def test_df_columns_filter_like(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.filter(like="64_col").to_pandas() - - pd_result = scalars_pandas_df_index.filter(like="64_col") - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_columns_filter_regex(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.filter(regex="^[^_]+$").to_pandas() - - pd_result = scalars_pandas_df_index.filter(regex="^[^_]+$") - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index): - if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): - pytest.skip("pandas filter items behavior different pre-2.1") - bf_result = scalars_df_index.filter(items=[5, 1, 3], axis=0).to_pandas() - - pd_result = scalars_pandas_df_index.filter(items=[5, 1, 3], axis=0) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - # Ignore ordering as pandas order differently depending on version - assert_pandas_df_equal( - bf_result, - pd_result, - ignore_order=True, - check_names=False, - ) - - -def test_df_rows_filter_like(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.copy().set_index("string_col") - scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") - - bf_result = scalars_df_index.filter(like="ello", axis=0).to_pandas() - - pd_result = scalars_pandas_df_index.filter(like="ello", axis=0) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_rows_filter_regex(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.copy().set_index("string_col") - scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") - - bf_result = scalars_df_index.filter(regex="^[GH].*", axis=0).to_pandas() - - pd_result = scalars_pandas_df_index.filter(regex="^[GH].*", axis=0) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_reindex_rows_list(scalars_dfs_maybe_ordered): - scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered - bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]) - - pd_result = scalars_pandas_df_index.reindex(index=[5, 1, 3, 99, 1]) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - assert_dfs_equivalent( - pd_result, - bf_result, - ) - - -def test_df_reindex_rows_index(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.reindex( - index=pd.Index([5, 1, 3, 99, 1], name="newname") - ).to_pandas() - - pd_result = scalars_pandas_df_index.reindex( - index=pd.Index([5, 1, 3, 99, 1], name="newname") - ) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_reindex_nonunique(scalars_df_index): - with pytest.raises(ValueError): - # int64_too is non-unique - scalars_df_index.set_index("int64_too").reindex( - index=[5, 1, 3, 99, 1], validate=True - ) - - -def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.reindex( - columns=["not_a_col", "int64_col", "int64_too"] - ).to_pandas() - - pd_result = scalars_pandas_df_index.reindex( - columns=["not_a_col", "int64_col", "int64_too"] - ) - - # Pandas uses float64 as default for newly created empty column, bf uses Float64 - pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_reindex_columns_with_same_order(scalars_df_index, scalars_pandas_df_index): - # First, make sure the two dataframes have the same columns in order. - columns = ["int64_col", "int64_too"] - bf = scalars_df_index[columns] - pd_df = scalars_pandas_df_index[columns] - - bf_result = bf.reindex(columns=columns).to_pandas() - pd_result = pd_df.reindex(columns=columns) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index): - unsupported = [ - "geography_col", - ] - scalars_df_index = scalars_df_index.drop(columns=unsupported) - scalars_pandas_df_index = scalars_pandas_df_index.drop(columns=unsupported) - - bf_result = scalars_df_index.equals(scalars_df_index) - pd_result = scalars_pandas_df_index.equals(scalars_pandas_df_index) - - assert pd_result == bf_result - - -def test_df_equals_series(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index[["int64_col"]].equals(scalars_df_index["int64_col"]) - pd_result = scalars_pandas_df_index[["int64_col"]].equals( - scalars_pandas_df_index["int64_col"] - ) - - assert pd_result == bf_result - - -def test_df_equals_different_dtype(scalars_df_index, scalars_pandas_df_index): - columns = ["int64_col", "int64_too"] - scalars_df_index = scalars_df_index[columns] - scalars_pandas_df_index = scalars_pandas_df_index[columns] - - bf_modified = scalars_df_index.copy() - bf_modified = bf_modified.astype("Float64") - - pd_modified = scalars_pandas_df_index.copy() - pd_modified = pd_modified.astype("Float64") - - bf_result = scalars_df_index.equals(bf_modified) - pd_result = scalars_pandas_df_index.equals(pd_modified) - - assert pd_result == bf_result - - -def test_df_equals_different_values(scalars_df_index, scalars_pandas_df_index): - columns = ["int64_col", "int64_too"] - scalars_df_index = scalars_df_index[columns] - scalars_pandas_df_index = scalars_pandas_df_index[columns] - - bf_modified = scalars_df_index.copy() - bf_modified["int64_col"] = bf_modified.int64_col + 1 - - pd_modified = scalars_pandas_df_index.copy() - pd_modified["int64_col"] = pd_modified.int64_col + 1 - - bf_result = scalars_df_index.equals(bf_modified) - pd_result = scalars_pandas_df_index.equals(pd_modified) - - assert pd_result == bf_result - - -def test_df_equals_extra_column(scalars_df_index, scalars_pandas_df_index): - columns = ["int64_col", "int64_too"] - more_columns = ["int64_col", "int64_too", "float64_col"] - - bf_result = scalars_df_index[columns].equals(scalars_df_index[more_columns]) - pd_result = scalars_pandas_df_index[columns].equals( - scalars_pandas_df_index[more_columns] - ) - - assert pd_result == bf_result - - -def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index): - reindex_target_bf = scalars_df_index.reindex( - columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1] - ) - bf_result = scalars_df_index.reindex_like(reindex_target_bf).to_pandas() - - reindex_target_pd = scalars_pandas_df_index.reindex( - columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1] - ) - pd_result = scalars_pandas_df_index.reindex_like(reindex_target_pd) - - # Pandas uses float64 as default for newly created empty column, bf uses Float64 - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - # Pandas uses float64 as default for newly created empty column, bf uses Float64 - pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_values(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.values - - pd_result = scalars_pandas_df_index.values - # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_frame_equal( - pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False - ) - - -def test_df_to_numpy(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.to_numpy() - - pd_result = scalars_pandas_df_index.to_numpy() - # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_frame_equal( - pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False - ) - - -def test_df___array__(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.__array__() - - pd_result = scalars_pandas_df_index.__array__() - # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_frame_equal( - pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False - ) - - -@pytest.mark.parametrize( - ("key",), - [ - ("hello",), - (2,), - ("int64_col",), - (None,), - ], -) -def test_df_contains(scalars_df_index, scalars_pandas_df_index, key): - bf_result = key in scalars_df_index - pd_result = key in scalars_pandas_df_index - - assert bf_result == pd_result - - -def test_df_getattr_attribute_error_when_pandas_has(scalars_df_index): - # swapaxes is implemented in pandas but not in bigframes - with pytest.raises(AttributeError): - scalars_df_index.swapaxes() - - -def test_df_getattr_attribute_error(scalars_df_index): - with pytest.raises(AttributeError): - scalars_df_index.not_a_method() - - -def test_df_getattr_axes(): - df = dataframe.DataFrame( - [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] - ) - assert isinstance(df.index, bigframes.core.indexes.Index) - assert isinstance(df.columns, pandas.Index) - assert isinstance(df.my_column, series.Series) - - -def test_df_setattr_index(): - pd_df = pandas.DataFrame( - [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] - ) - bf_df = dataframe.DataFrame(pd_df) - - pd_df.index = pandas.Index([4, 5]) - bf_df.index = [4, 5] - - assert_pandas_df_equal( - pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False - ) - - -def test_df_setattr_columns(): - pd_df = pandas.DataFrame( - [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] - ) - bf_df = dataframe.DataFrame(pd_df) - - pd_df.columns = typing.cast(pandas.Index, pandas.Index([4, 5, 6])) - - bf_df.columns = pandas.Index([4, 5, 6]) - - assert_pandas_df_equal( - pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False - ) - - -def test_df_setattr_modify_column(): - pd_df = pandas.DataFrame( - [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] - ) - bf_df = dataframe.DataFrame(pd_df) - pd_df.my_column = [4, 5] - bf_df.my_column = [4, 5] - - assert_pandas_df_equal( - pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False - ) - - -def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): - index_list = scalars_pandas_df_index.string_col.iloc[[0, 1, 1, 5]].values - - scalars_df_index = scalars_df_index.set_index("string_col") - scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") - - bf_result = scalars_df_index.loc[index_list].to_pandas() - pd_result = scalars_pandas_df_index.loc[index_list] - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): - index_list = [3, 2, 1, 3, 2, 1] - - bf_result = scalars_df_index.loc[index_list] - pd_result = scalars_pandas_df_index.loc[index_list] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_list_multiindex(scalars_dfs_maybe_ordered): - scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered - scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) - scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( - ["string_col", "int64_col"] - ) - index_list = [("Hello, World!", -234892), ("Hello, World!", 123456789)] - - bf_result = scalars_df_multiindex.loc[index_list] - pd_result = scalars_pandas_df_multiindex.loc[index_list] - - assert_dfs_equivalent( - pd_result, - bf_result, - ) - - -@pytest.mark.parametrize( - "index_list", - [ - [0, 1, 2, 3, 4, 4], - [0, 0, 0, 5, 4, 7, -2, -5, 3], - [-1, -2, -3, -4, -5, -5], - ], -) -def test_iloc_list(scalars_df_index, scalars_pandas_df_index, index_list): - bf_result = scalars_df_index.iloc[index_list] - pd_result = scalars_pandas_df_index.iloc[index_list] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -@pytest.mark.parametrize( - "index_list", - [ - [0, 1, 2, 3, 4, 4], - [0, 0, 0, 5, 4, 7, -2, -5, 3], - [-1, -2, -3, -4, -5, -5], - ], -) -def test_iloc_list_partial_ordering( - scalars_df_partial_ordering, scalars_pandas_df_index, index_list -): - bf_result = scalars_df_partial_ordering.iloc[index_list] - pd_result = scalars_pandas_df_index.iloc[index_list] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_iloc_list_multiindex(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.copy() - scalars_pandas_df = scalars_pandas_df.copy() - scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) - scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) - - index_list = [0, 0, 0, 5, 4, 7] - - bf_result = scalars_df.iloc[index_list] - pd_result = scalars_pandas_df.iloc[index_list] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): - - index_list: List[int] = [] - - bf_result = scalars_df_index.iloc[index_list] - pd_result = scalars_pandas_df_index.iloc[index_list] - - bf_result = bf_result.to_pandas() - assert bf_result.shape == pd_result.shape # types are known to be different - - -def test_rename_axis(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.rename_axis("newindexname") - pd_result = scalars_pandas_df_index.rename_axis("newindexname") - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_rename_axis_nonstring(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.rename_axis((4,)) - pd_result = scalars_pandas_df_index.rename_axis((4,)) - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): - pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - - scalars_df_index = scalars_df_index.set_index("string_col") - scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") - - bf_result = scalars_df_index.loc[bf_string_series] - pd_result = scalars_pandas_df_index.loc[pd_string_series] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): - pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - - scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) - scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( - ["string_col", "int64_col"] - ) - - bf_result = scalars_df_multiindex.loc[bf_string_series] - pd_result = scalars_pandas_df_multiindex.loc[pd_string_series] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): - pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index - bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index - - bf_result = scalars_df_index.loc[bf_index] - pd_result = scalars_pandas_df_index.loc[pd_index] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_index_integer_index_renamed_col( - scalars_df_index, scalars_pandas_df_index -): - scalars_df_index = scalars_df_index.rename(columns={"int64_col": "rename"}) - scalars_pandas_df_index = scalars_pandas_df_index.rename( - columns={"int64_col": "rename"} - ) - - pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index - bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index - - bf_result = scalars_df_index.loc[bf_index] - pd_result = scalars_pandas_df_index.loc[pd_index] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -@pytest.mark.parametrize( - ("subset"), - [ - None, - "bool_col", - ["bool_col", "int64_too"], - ], -) -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - (False,), - ], -) -def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, subset): - columns = ["bool_col", "int64_too", "int64_col"] - bf_df = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas() - pd_df = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep) - pd.testing.assert_frame_equal( - pd_df, - bf_df, - ) - - -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - (False,), - ], -) -def test_df_drop_duplicates_w_json(json_df, keep): - bf_df = json_df.drop_duplicates(keep=keep).to_pandas() - - # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible - # with Arrow string extension types. Temporary conversion to standard Pandas - # strings is required. - json_pandas_df = json_df.to_pandas() - json_pandas_df["json_col"] = json_pandas_df["json_col"].astype( - pd.StringDtype(storage="pyarrow") - ) - - pd_df = json_pandas_df.drop_duplicates(keep=keep) - pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE) - pd.testing.assert_frame_equal( - pd_df, - bf_df, - ) - - -@pytest.mark.parametrize( - ("subset"), - [ - None, - ["bool_col"], - ], -) -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - (False,), - ], -) -def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset): - columns = ["bool_col", "int64_too", "int64_col"] - bf_series = scalars_df_index[columns].duplicated(subset, keep=keep).to_pandas() - pd_series = scalars_pandas_df_index[columns].duplicated(subset, keep=keep) - pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) - - -def test_df_from_dict_columns_orient(): - data = {"a": [1, 2], "b": [3.3, 2.4]} - bf_result = dataframe.DataFrame.from_dict(data, orient="columns").to_pandas() - pd_result = pd.DataFrame.from_dict(data, orient="columns") - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) - - -def test_df_from_dict_index_orient(): - data = {"a": [1, 2], "b": [3.3, 2.4]} - bf_result = dataframe.DataFrame.from_dict( - data, orient="index", columns=["col1", "col2"] - ).to_pandas() - pd_result = pd.DataFrame.from_dict(data, orient="index", columns=["col1", "col2"]) - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) - - -def test_df_from_dict_tight_orient(): - data = { - "index": [("i1", "i2"), ("i3", "i4")], - "columns": ["col1", "col2"], - "data": [[1, 2.6], [3, 4.5]], - "index_names": ["in1", "in2"], - "column_names": ["column_axis"], - } - - bf_result = dataframe.DataFrame.from_dict(data, orient="tight").to_pandas() - pd_result = pd.DataFrame.from_dict(data, orient="tight") - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) - - -def test_df_from_records(): - records = ((1, "a"), (2.5, "b"), (3.3, "c"), (4.9, "d")) - - bf_result = dataframe.DataFrame.from_records( - records, columns=["c1", "c2"] - ).to_pandas() - pd_result = pd.DataFrame.from_records(records, columns=["c1", "c2"]) - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) - - -def test_df_to_dict(scalars_df_index, scalars_pandas_df_index): - unsupported = ["numeric_col"] # formatted differently - bf_result = scalars_df_index.drop(columns=unsupported).to_dict() - pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_dict() - - assert bf_result == pd_result - - -def test_df_to_excel(scalars_df_index, scalars_pandas_df_index): - unsupported = ["timestamp_col"] - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.drop(columns=unsupported).to_excel(bf_result_file) - scalars_pandas_df_index.drop(columns=unsupported).to_excel(pd_result_file) - bf_result = bf_result_file.read() - pd_result = bf_result_file.read() - - assert bf_result == pd_result - - -def test_df_to_latex(scalars_df_index, scalars_pandas_df_index): - unsupported = ["numeric_col"] # formatted differently - bf_result = scalars_df_index.drop(columns=unsupported).to_latex() - pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_latex() - - assert bf_result == pd_result - - -def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.to_json() - # default_handler for arrow types that have no default conversion - pd_result = scalars_pandas_df_index.to_json(default_handler=str) - - assert bf_result == pd_result - - -def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - # duration not fully supported at pandas level - scalars_df_index = scalars_df_index.drop(columns="duration_col") - scalars_pandas_df_index = scalars_pandas_df_index.drop(columns="duration_col") - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.to_json(bf_result_file, orient="table") - # default_handler for arrow types that have no default conversion - scalars_pandas_df_index.to_json( - pd_result_file, orient="table", default_handler=str - ) - - bf_result = bf_result_file.read() - pd_result = pd_result_file.read() - - assert bf_result == pd_result - - -def test_df_to_csv_local_str(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.to_csv() - # default_handler for arrow types that have no default conversion - pd_result = scalars_pandas_df_index.to_csv() - - assert bf_result == pd_result - - -def test_df_to_csv_local_file(scalars_df_index, scalars_pandas_df_index): - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.to_csv(bf_result_file) - scalars_pandas_df_index.to_csv(pd_result_file) - - bf_result = bf_result_file.read() - pd_result = pd_result_file.read() - - assert bf_result == pd_result - - -def test_df_to_parquet_local_bytes(scalars_df_index, scalars_pandas_df_index): - # GEOGRAPHY not supported in parquet export. - unsupported = ["geography_col"] - - bf_result = scalars_df_index.drop(columns=unsupported).to_parquet() - # default_handler for arrow types that have no default conversion - pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_parquet() - - assert bf_result == pd_result - - -def test_df_to_parquet_local_file(scalars_df_index, scalars_pandas_df_index): - # GEOGRAPHY not supported in parquet export. - unsupported = ["geography_col"] - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.drop(columns=unsupported).to_parquet(bf_result_file) - scalars_pandas_df_index.drop(columns=unsupported).to_parquet(pd_result_file) - - bf_result = bf_result_file.read() - pd_result = pd_result_file.read() - - assert bf_result == pd_result - - -def test_df_to_records(scalars_df_index, scalars_pandas_df_index): - unsupported = ["numeric_col"] - bf_result = scalars_df_index.drop(columns=unsupported).to_records() - pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_records() - - for bfi, pdi in zip(bf_result, pd_result): - for bfj, pdj in zip(bfi, pdi): - assert pd.isna(bfj) and pd.isna(pdj) or bfj == pdj - - -def test_df_to_string(scalars_df_index, scalars_pandas_df_index): - unsupported = ["numeric_col"] # formatted differently - - bf_result = scalars_df_index.drop(columns=unsupported).to_string() - pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_string() - - assert bf_result == pd_result - - -def test_df_to_html(scalars_df_index, scalars_pandas_df_index): - unsupported = ["numeric_col"] # formatted differently - - bf_result = scalars_df_index.drop(columns=unsupported).to_html() - pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_html() - - assert bf_result == pd_result - - -def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index): - # Nulls have bug from tabulate https://github.com/astanin/python-tabulate/issues/231 - bf_result = scalars_df_index.dropna().to_markdown() - pd_result = scalars_pandas_df_index.dropna().to_markdown() - - assert bf_result == pd_result - - -def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index): - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.to_pickle(bf_result_file) - scalars_pandas_df_index.to_pickle(pd_result_file) - bf_result = bf_result_file.read() - pd_result = pd_result_file.read() - - assert bf_result == pd_result - - -def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): - unsupported = [ - "numeric_col", - "bytes_col", - "date_col", - "datetime_col", - "time_col", - "timestamp_col", - "geography_col", - "duration_col", - ] - - bf_result_file = tempfile.TemporaryFile() - pd_result_file = tempfile.TemporaryFile() - scalars_df_index.drop(columns=unsupported).to_orc(bf_result_file) - scalars_pandas_df_index.drop(columns=unsupported).reset_index().to_orc( - pd_result_file - ) - bf_result = bf_result_file.read() - pd_result = bf_result_file.read() - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("expr",), - [ - ("new_col = int64_col + int64_too",), - ("new_col = (rowindex > 3) | bool_col",), - ("int64_too = bool_col\nnew_col2 = rowindex",), - ], -) -def test_df_eval(scalars_dfs, expr): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.eval(expr).to_pandas() - pd_result = scalars_pandas_df.eval(expr) - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("expr",), - [ - ("int64_col > int64_too",), - ("bool_col",), - ("((int64_col - int64_too) % @local_var) == 0",), - ], -) -def test_df_query(scalars_dfs, expr): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - # local_var is referenced in expressions - local_var = 3 # NOQA - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.query(expr).to_pandas() - pd_result = scalars_pandas_df.query(expr) - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("subset", "normalize", "ascending", "dropna"), - [ - (None, False, False, False), - (None, True, True, True), - ("bool_col", True, False, True), - ], -) -def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): - if pd.__version__.startswith("1."): - pytest.skip("pandas 1.x produces different column labels.") - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = ( - scalars_df[["string_col", "bool_col"]] - .value_counts(subset, normalize=normalize, ascending=ascending, dropna=dropna) - .to_pandas() - ) - pd_result = scalars_pandas_df[["string_col", "bool_col"]].value_counts( - subset, normalize=normalize, ascending=ascending, dropna=dropna - ) - - pd.testing.assert_series_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("na_option", "method", "ascending", "numeric_only", "pct"), - [ - ("keep", "average", True, True, True), - ("top", "min", False, False, False), - ("bottom", "max", False, False, True), - ("top", "first", False, False, False), - ("bottom", "dense", False, False, True), - ], -) -def test_df_rank_with_nulls( - scalars_df_index, - scalars_pandas_df_index, - na_option, - method, - ascending, - numeric_only, - pct, -): - unsupported_columns = ["geography_col"] - bf_result = ( - scalars_df_index.drop(columns=unsupported_columns) - .rank( - na_option=na_option, - method=method, - ascending=ascending, - numeric_only=numeric_only, - pct=pct, - ) - .to_pandas() - ) - pd_result = ( - scalars_pandas_df_index.drop(columns=unsupported_columns) - .rank( - na_option=na_option, - method=method, - ascending=ascending, - numeric_only=numeric_only, - pct=pct, - ) - .astype(pd.Float64Dtype()) - ) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_bool_interpretation_error(scalars_df_index): - with pytest.raises(ValueError): - True if scalars_df_index else False - - -def test_query_job_setters(scalars_df_default_index: dataframe.DataFrame): - # if allow_large_results=False, might not create query job - with bigframes.option_context("compute.allow_large_results", True): - job_ids = set() - repr(scalars_df_default_index) - assert scalars_df_default_index.query_job is not None - job_ids.add(scalars_df_default_index.query_job.job_id) - scalars_df_default_index.to_pandas(allow_large_results=True) - job_ids.add(scalars_df_default_index.query_job.job_id) - - assert len(job_ids) == 2 - - -def test_df_cached(scalars_df_index): - df = scalars_df_index.set_index(["int64_too", "int64_col"]).sort_values( - "string_col" - ) - df = df[df["rowindex_2"] % 2 == 0] - - df_cached_copy = df.cache() - pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) - - -def test_df_cached_many_index_cols(scalars_df_index): - index_cols = [ - "int64_too", - "time_col", - "int64_col", - "bool_col", - "date_col", - "timestamp_col", - "string_col", - ] - df = scalars_df_index.set_index(index_cols) - df = df[df["rowindex_2"] % 2 == 0] - - df_cached_copy = df.cache() - pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) - - -def test_assign_after_binop_row_joins(): - pd_df = pd.DataFrame( - { - "idx1": [1, 1, 1, 1, 2, 2, 2, 2], - "idx2": [10, 10, 20, 20, 10, 10, 20, 20], - "metric1": [10, 14, 2, 13, 6, 2, 9, 5], - "metric2": [25, -3, 8, 2, -1, 0, 0, -4], - }, - dtype=pd.Int64Dtype(), - ).set_index(["idx1", "idx2"]) - bf_df = dataframe.DataFrame(pd_df) - - # Expect implicit joiner to be used, preserving input cardinality rather than getting relational join - bf_df["metric_diff"] = bf_df.metric1 - bf_df.metric2 - pd_df["metric_diff"] = pd_df.metric1 - pd_df.metric2 - - assert_pandas_df_equal(bf_df.to_pandas(), pd_df) - - -def test_df_cache_with_implicit_join(scalars_df_index): - """expectation is that cache will be used, but no explicit join will be performed""" - df = scalars_df_index[["int64_col", "int64_too"]].sort_index().reset_index() + 3 - df.cache() - bf_result = df + (df * 2) - sql = bf_result.sql - - # Very crude asserts, want sql to not use join and not use base table, only reference cached table - assert "JOIN" not in sql - assert "bigframes_testing" not in sql - - -def test_df_dot_inline(session): - df1 = pd.DataFrame([[1, 2, 3], [2, 5, 7]]) - df2 = pd.DataFrame([[2, 4, 8], [1, 5, 10], [3, 6, 9]]) - - bf1 = session.read_pandas(df1) - bf2 = session.read_pandas(df2) - bf_result = bf1.dot(bf2).to_pandas() - pd_result = df1.dot(df2) - - # Patch pandas dtypes for testing parity - # Pandas uses int64 instead of Int64 (nullable) dtype. - for name in pd_result.columns: - pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_dot( - matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df -): - bf_result = matrix_2by3_df.dot(matrix_3by4_df).to_pandas() - pd_result = matrix_2by3_pandas_df.dot(matrix_3by4_pandas_df) - - # Patch pandas dtypes for testing parity - # Pandas result is object instead of Int64 (nullable) dtype. - for name in pd_result.columns: - pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_dot_operator( - matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df -): - bf_result = (matrix_2by3_df @ matrix_3by4_df).to_pandas() - pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df - - # Patch pandas dtypes for testing parity - # Pandas result is object instead of Int64 (nullable) dtype. - for name in pd_result.columns: - pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_dot_series_inline(): - left = [[1, 2, 3], [2, 5, 7]] - right = [2, 1, 3] - - bf1 = dataframe.DataFrame(left) - bf2 = series.Series(right) - bf_result = bf1.dot(bf2).to_pandas() - - df1 = pd.DataFrame(left) - df2 = pd.Series(right) - pd_result = df1.dot(df2) - - # Patch pandas dtypes for testing parity - # Pandas result is int64 instead of Int64 (nullable) dtype. - pd_result = pd_result.astype(pd.Int64Dtype()) - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_df_dot_series( - matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df -): - bf_result = matrix_2by3_df.dot(matrix_3by4_df["x"]).to_pandas() - pd_result = matrix_2by3_pandas_df.dot(matrix_3by4_pandas_df["x"]) - - # Patch pandas dtypes for testing parity - # Pandas result is object instead of Int64 (nullable) dtype. - pd_result = pd_result.astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_df_dot_operator_series( - matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df -): - bf_result = (matrix_2by3_df @ matrix_3by4_df["x"]).to_pandas() - pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df["x"] - - # Patch pandas dtypes for testing parity - # Pandas result is object instead of Int64 (nullable) dtype. - pd_result = pd_result.astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -# TODO(tswast): We may be able to re-enable this test after we break large -# queries up in https://github.com/googleapis/python-bigquery-dataframes/pull/427 -@pytest.mark.skipif( - sys.version_info >= (3, 12), - # See: https://github.com/python/cpython/issues/112282 - reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", -) -def test_recursion_limit(scalars_df_index): - scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]] - for i in range(400): - scalars_df_index = scalars_df_index + 4 - scalars_df_index.to_pandas() - - -@pytest.mark.skipif( - reason="b/366477265: Skip until query complexity error can be reliably triggered." -) -def test_query_complexity_error(scalars_df_index): - # This test requires automatic caching/query decomposition to be turned off - bf_df = scalars_df_index - for _ in range(8): - bf_df = bf_df.merge(bf_df, on="int64_col").head(30) - bf_df = bf_df[bf_df.columns[:20]] - - with pytest.raises( - bigframes.exceptions.QueryComplexityError, match=r"Try using DataFrame\.cache" - ): - bf_df.to_pandas() - - -def test_query_complexity_repeated_joins( - scalars_df_index, scalars_pandas_df_index, with_multiquery_execution -): - pd_df = scalars_pandas_df_index - bf_df = scalars_df_index - for _ in range(8): - # recursively join, resuling in 2^8 - 1 = 255 joins - pd_df = pd_df.merge(pd_df, on="int64_col").head(30) - pd_df = pd_df[pd_df.columns[:20]] - bf_df = bf_df.merge(bf_df, on="int64_col").head(30) - bf_df = bf_df[bf_df.columns[:20]] - - bf_result = bf_df.to_pandas() - pd_result = pd_df - assert_pandas_df_equal(bf_result, pd_result, check_index_type=False) - - -def test_query_complexity_repeated_subtrees( - scalars_df_index, scalars_pandas_df_index, with_multiquery_execution -): - # Recursively union the data, if fully inlined has 10^5 identical root tables. - pd_df = scalars_pandas_df_index - bf_df = scalars_df_index - for _ in range(5): - pd_df = pd.concat(10 * [pd_df]).head(5) - bf_df = bpd.concat(10 * [bf_df]).head(5) - bf_result = bf_df.to_pandas() - pd_result = pd_df - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.skipif( - sys.version_info >= (3, 12), - # See: https://github.com/python/cpython/issues/112282 - reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", -) -def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_index): - bf_df = scalars_df_index[["int64_col", "int64_too"]] - pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]] - # Uses LAG analytic operator, each in a new SELECT - for _ in range(50): - bf_df = bf_df.diff() - pd_df = pd_df.diff() - bf_result = bf_df.to_pandas() - pd_result = pd_df - assert_pandas_df_equal(bf_result, pd_result) - - -def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created): - dataset_id = dataset_id_not_created - destination_table = f"{dataset_id}.scalars_df" - - result_table = scalars_df_index.to_gbq(destination_table) - assert ( - result_table == destination_table - if destination_table - else result_table is not None - ) - - loaded_scalars_df_index = session.read_gbq(result_table) - assert not loaded_scalars_df_index.empty - - -def test_read_gbq_to_pandas_no_exec(unordered_session: bigframes.Session): - metrics = unordered_session._metrics - execs_pre = metrics.execution_count - df = unordered_session.read_gbq("bigquery-public-data.ml_datasets.penguins") - df.to_pandas() - execs_post = metrics.execution_count - assert df.shape == (344, 7) - assert execs_pre == execs_post - - -def test_to_gbq_table_labels(scalars_df_index): - destination_table = "bigframes-dev.bigframes_tests_sys.table_labels" - result_table = scalars_df_index.to_gbq( - destination_table, labels={"test": "labels"}, if_exists="replace" - ) - client = scalars_df_index._session.bqclient - table = client.get_table(result_table) - assert table.labels - assert table.labels["test"] == "labels" - - -@pytest.mark.parametrize( - ("col_names", "ignore_index"), - [ - pytest.param(["A"], False, id="one_array_false"), - pytest.param(["A"], True, id="one_array_true"), - pytest.param(["B"], False, id="one_float_false"), - pytest.param(["B"], True, id="one_float_true"), - pytest.param(["A", "C"], False, id="two_arrays_false"), - pytest.param(["A", "C"], True, id="two_arrays_true"), - ], -) -def test_dataframe_explode(col_names, ignore_index, session): - data = { - "A": [[0, 1, 2], [], [3, 4]], - "B": 3, - "C": [["a", "b", "c"], np.nan, ["d", "e"]], - } - - metrics = session._metrics - df = bpd.DataFrame(data, session=session) - pd_df = df.to_pandas() - pd_result = pd_df.explode(col_names, ignore_index=ignore_index) - bf_result = df.explode(col_names, ignore_index=ignore_index) - - # Check that to_pandas() results in at most a single query execution - execs_pre = metrics.execution_count - bf_materialized = bf_result.to_pandas() - execs_post = metrics.execution_count - - pd.testing.assert_frame_equal( - bf_materialized, - pd_result, - check_index_type=False, - check_dtype=False, - ) - # we test this property on this method in particular as compilation - # is non-deterministic and won't use the query cache as implemented - assert execs_post - execs_pre <= 1 - - -@pytest.mark.parametrize( - ("ignore_index", "ordered"), - [ - pytest.param(True, True, id="include_index_ordered"), - pytest.param(True, False, id="include_index_unordered"), - pytest.param(False, True, id="ignore_index_ordered"), - ], -) -def test_dataframe_explode_reserve_order(ignore_index, ordered): - data = { - "a": [np.random.randint(0, 10, 10) for _ in range(10)], - "b": [np.random.randint(0, 10, 10) for _ in range(10)], - } - df = bpd.DataFrame(data) - pd_df = pd.DataFrame(data) - - res = df.explode(["a", "b"], ignore_index=ignore_index).to_pandas(ordered=ordered) - pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index).astype( - pd.Int64Dtype() - ) - pd.testing.assert_frame_equal( - res if ordered else res.sort_index(), - pd_res, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - ("col_names"), - [ - pytest.param([], id="empty", marks=pytest.mark.xfail(raises=ValueError)), - pytest.param( - ["A", "A"], id="duplicate", marks=pytest.mark.xfail(raises=ValueError) - ), - pytest.param("unknown", id="unknown", marks=pytest.mark.xfail(raises=KeyError)), - ], -) -def test_dataframe_explode_xfail(col_names): - df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]}) - df.explode(col_names) - - -@pytest.mark.parametrize( - ("on", "rule", "origin"), - [ - pytest.param("datetime_col", "100D", "start"), - pytest.param("datetime_col", "30W", "start"), - pytest.param("datetime_col", "5M", "epoch"), - pytest.param("datetime_col", "3Q", "start_day"), - pytest.param("datetime_col", "3YE", "start"), - pytest.param( - "int64_col", "100D", "start", marks=pytest.mark.xfail(raises=TypeError) - ), - pytest.param( - "datetime_col", "100D", "end", marks=pytest.mark.xfail(raises=ValueError) - ), - ], -) -def test__resample_with_column( - scalars_df_index, scalars_pandas_df_index, on, rule, origin -): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - bf_result = ( - scalars_df_index._resample(rule=rule, on=on, origin=origin)[ - ["int64_col", "int64_too"] - ] - .max() - .to_pandas() - ) - pd_result = scalars_pandas_df_index.resample(rule=rule, on=on, origin=origin)[ - ["int64_col", "int64_too"] - ].max() - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("append", "level", "col", "rule"), - [ - pytest.param(False, None, "timestamp_col", "100d"), - pytest.param(True, 1, "timestamp_col", "1200h"), - pytest.param(False, None, "datetime_col", "100d"), - ], -) -def test__resample_with_index( - scalars_df_index, scalars_pandas_df_index, append, level, col, rule -): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df_index = scalars_df_index.set_index(col, append=append) - scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append) - bf_result = ( - scalars_df_index[["int64_col", "int64_too"]] - ._resample(rule=rule, level=level) - .min() - .to_pandas() - ) - pd_result = ( - scalars_pandas_df_index[["int64_col", "int64_too"]] - .resample(rule=rule, level=level) - .min() - ) - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("rule", "origin", "data"), - [ - ( - "5h", - "epoch", - { - "timestamp_col": pd.date_range( - start="2021-01-01 13:00:00", periods=30, freq="1h" - ), - "int64_col": range(30), - "int64_too": range(10, 40), - }, - ), - ( - "75min", - "start_day", - { - "timestamp_col": pd.date_range( - start="2021-01-01 13:00:00", periods=30, freq="10min" - ), - "int64_col": range(30), - "int64_too": range(10, 40), - }, - ), - ( - "7s", - "epoch", - { - "timestamp_col": pd.date_range( - start="2021-01-01 13:00:00", periods=30, freq="1s" - ), - "int64_col": range(30), - "int64_too": range(10, 40), - }, - ), - ], -) -def test__resample_start_time(rule, origin, data): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - col = "timestamp_col" - scalars_df_index = bpd.DataFrame(data).set_index(col) - scalars_pandas_df_index = pd.DataFrame(data).set_index(col) - scalars_pandas_df_index.index.name = None - - bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() - - pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() - - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - "dtype", - [ - pytest.param("string[pyarrow]", id="type-string"), - pytest.param(pd.StringDtype(storage="pyarrow"), id="type-literal"), - pytest.param( - {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()}, - id="multiple-types", - ), - ], -) -def test_df_astype(scalars_dfs, dtype): - bf_df, pd_df = scalars_dfs - target_cols = ["bool_col", "int64_col"] - bf_df = bf_df[target_cols] - pd_df = pd_df[target_cols] - - bf_result = bf_df.astype(dtype).to_pandas() - pd_result = pd_df.astype(dtype) - - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -def test_df_astype_python_types(scalars_dfs): - bf_df, pd_df = scalars_dfs - target_cols = ["bool_col", "int64_col"] - bf_df = bf_df[target_cols] - pd_df = pd_df[target_cols] - - bf_result = bf_df.astype({"bool_col": str, "int64_col": float}).to_pandas() - pd_result = pd_df.astype( - {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()} - ) - - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -def test_astype_invalid_type_fail(scalars_dfs): - bf_df, _ = scalars_dfs - - with pytest.raises(TypeError, match=r".*Share your use case with.*"): - bf_df.astype(123) - - -def test_agg_with_dict_lists_strings(scalars_dfs): - bf_df, pd_df = scalars_dfs - agg_funcs = { - "int64_too": ["min", "max"], - "int64_col": ["min", "count"], - } - - bf_result = bf_df.agg(agg_funcs).to_pandas() - pd_result = pd_df.agg(agg_funcs) - - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def test_agg_with_dict_lists_callables(scalars_dfs): - bf_df, pd_df = scalars_dfs - agg_funcs = { - "int64_too": [np.min, np.max], - "int64_col": [np.min, np.var], - } - - bf_result = bf_df.agg(agg_funcs).to_pandas() - pd_result = pd_df.agg(agg_funcs) - - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def test_agg_with_dict_list_and_str(scalars_dfs): - bf_df, pd_df = scalars_dfs - agg_funcs = { - "int64_too": ["min", "max"], - "int64_col": "sum", - } - - bf_result = bf_df.agg(agg_funcs).to_pandas() - pd_result = pd_df.agg(agg_funcs) - - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def test_agg_with_dict_strs(scalars_dfs): - bf_df, pd_df = scalars_dfs - agg_funcs = { - "int64_too": "min", - "int64_col": "sum", - "float64_col": "max", - } - - bf_result = bf_df.agg(agg_funcs).to_pandas() - pd_result = pd_df.agg(agg_funcs) - pd_result.index = pd_result.index.astype("string[pyarrow]") - - pd.testing.assert_series_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): - bf_df, _ = scalars_dfs - agg_funcs = { - "int64_too": ["min", "max"], - "nonexisting_col": ["count"], - } - - with pytest.raises(KeyError): - bf_df.agg(agg_funcs) + # Verify the result is a string representation + assert isinstance(result["json_col"].iloc[0], str) From 30a9ef621e903109e0dcc213940a097e9d415afc Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 22:29:28 +0000 Subject: [PATCH 29/37] Revert scalar_op_registry.py chnage --- bigframes/core/compile/ibis_compiler/scalar_op_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 7b17aac61a..e983fc7e21 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1036,7 +1036,7 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): if to_type == ibis_dtypes.bool: return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x) if to_type == ibis_dtypes.string: - return to_json_string(x) + return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x) # TODO: either inline this function, or push rest of this op into the function return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe) From 6895def33e6a43577f1908c7b2c171d7b94e87ca Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 22:31:53 +0000 Subject: [PATCH 30/37] remove unnecessary import --- bigframes/dataframe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 788a47f38b..f3b78e8218 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -852,9 +852,7 @@ def _repr_html_(self) -> str: if opts.repr_mode == "anywidget": try: - import anywidget # noqa: F401 from IPython.display import display as ipython_display - import traitlets # noqa: F401 from bigframes import display From 46444c11ec6148f0ec595a44f0fefcc91ad802d0 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 22:47:44 +0000 Subject: [PATCH 31/37] Remove duplicate conversation --- bigframes/display/anywidget.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index ff5a51f312..cf5d4e6310 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -74,21 +74,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget." ) - super().__init__() - # Workaround for Arrow bug https://github.com/apache/arrow/issues/45262 - # JSON columns are not supported in `to_pandas_batches` and will be converted to string. - json_cols = [ - col - for col, dtype in dataframe.dtypes.items() - if dtype == bigframes.dtypes.JSON_DTYPE - ] - if json_cols: - df_copy = dataframe.copy() - for col in json_cols: - df_copy[str(col)] = df_copy[str(col)].astype("string") - self._dataframe = df_copy - else: - self._dataframe = dataframe + self._dataframe = dataframe super().__init__() From 3b8367b3fc74abaf72d5b246b7038ecc6d9a763e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 22:52:55 +0000 Subject: [PATCH 32/37] revert changes to test_dataframe.py --- tests/system/small/test_dataframe.py | 6151 +++++++++++++++++++++++++- 1 file changed, 6142 insertions(+), 9 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ffd9bc512b..79f8efd00f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1,11 +1,6144 @@ -def test_to_pandas_batches_with_json_columns(session): - """Test that JSON columns are properly handled in to_pandas_batches.""" - # Create a DataFrame with JSON column - df = session.read_gbq('SELECT JSON \'{"key": "value"}\' as json_col') +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. - # This should not raise an error - batches = df._to_pandas_batches(page_size=10) - result = next(batches) +import io +import operator +import sys +import tempfile +import typing +from typing import Dict, List, Tuple - # Verify the result is a string representation - assert isinstance(result["json_col"].iloc[0], str) +import geopandas as gpd # type: ignore +import numpy as np +import pandas as pd +import pandas.testing +import pyarrow as pa # type: ignore +import pytest + +import bigframes +import bigframes._config.display_options as display_options +import bigframes.core.indexes as bf_indexes +import bigframes.dataframe as dataframe +import bigframes.dtypes as dtypes +import bigframes.pandas as bpd +import bigframes.series as series +from bigframes.testing.utils import ( + assert_dfs_equivalent, + assert_pandas_df_equal, + assert_series_equal, + assert_series_equivalent, +) + + +def test_df_construct_copy(scalars_dfs): + columns = ["int64_col", "string_col", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + # Make the mapping from label to col_id non-trivial + bf_df = scalars_df.copy() + bf_df["int64_col"] = bf_df["int64_col"] / 2 + pd_df = scalars_pandas_df.copy() + pd_df["int64_col"] = pd_df["int64_col"] / 2 + + bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas() + + pd_result = pd.DataFrame(pd_df, columns=columns) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_construct_pandas_default(scalars_dfs): + # This should trigger the inlined codepath + columns = [ + "int64_too", + "int64_col", + "float64_col", + "bool_col", + "string_col", + "date_col", + "datetime_col", + "numeric_col", + "float64_col", + "time_col", + "timestamp_col", + ] + _, scalars_pandas_df = scalars_dfs + bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns).to_pandas() + pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("write_engine"), + [ + ("bigquery_inline"), + ("bigquery_load"), + ("bigquery_streaming"), + ("bigquery_write"), + ], +) +def test_read_pandas_all_nice_types( + session: bigframes.Session, scalars_pandas_df_index: pd.DataFrame, write_engine +): + bf_result = session.read_pandas( + scalars_pandas_df_index, write_engine=write_engine + ).to_pandas() + pandas.testing.assert_frame_equal(bf_result, scalars_pandas_df_index) + + +def test_df_construct_large_strings(): + data = [["hello", "w" + "o" * 50000 + "rld"]] + bf_result = dataframe.DataFrame(data).to_pandas() + pd_result = pd.DataFrame(data, dtype=pd.StringDtype(storage="pyarrow")) + pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +def test_df_construct_pandas_load_job(scalars_dfs_maybe_ordered): + # This should trigger the inlined codepath + columns = [ + "int64_too", + "int64_col", + "float64_col", + "bool_col", + "string_col", + "date_col", + "datetime_col", + "numeric_col", + "float64_col", + "time_col", + "timestamp_col", + "geography_col", + ] + _, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns) + pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) + assert_dfs_equivalent(pd_result, bf_result) + + +def test_df_construct_structs(session): + pd_frame = pd.Series( + [ + {"version": 1, "project": "pandas"}, + {"version": 2, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ] + ).to_frame() + bf_series = session.read_pandas(pd_frame) + pd.testing.assert_frame_equal( + bf_series.to_pandas(), pd_frame, check_index_type=False, check_dtype=False + ) + + +def test_df_construct_local_concat_pd(scalars_pandas_df_index, session): + pd_df = pd.concat([scalars_pandas_df_index, scalars_pandas_df_index]) + + bf_df = session.read_pandas(pd_df) + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), pd_df, check_index_type=False, check_dtype=False + ) + + +def test_df_construct_pandas_set_dtype(scalars_dfs): + columns = [ + "int64_too", + "int64_col", + "float64_col", + "bool_col", + ] + _, scalars_pandas_df = scalars_dfs + bf_result = dataframe.DataFrame( + scalars_pandas_df, columns=columns, dtype="Float64" + ).to_pandas() + pd_result = pd.DataFrame(scalars_pandas_df, columns=columns, dtype="Float64") + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_construct_from_series(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_result = dataframe.DataFrame( + {"a": scalars_df["int64_col"], "b": scalars_df["string_col"]}, + dtype="string[pyarrow]", + ) + pd_result = pd.DataFrame( + {"a": scalars_pandas_df["int64_col"], "b": scalars_pandas_df["string_col"]}, + dtype="string[pyarrow]", + ) + assert_dfs_equivalent(pd_result, bf_result) + + +def test_df_construct_from_dict(): + input_dict = { + "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], + # With a space in column name. We use standardized SQL schema ids to solve the problem that BQ schema doesn't support column names with spaces. b/296751058 + "Max Speed": [380.0, 370.0, 24.0, 26.0], + } + bf_result = dataframe.DataFrame(input_dict).to_pandas() + pd_result = pd.DataFrame(input_dict) + + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("json_type"), + [ + pytest.param(dtypes.JSON_DTYPE), + pytest.param("json"), + ], +) +def test_df_construct_w_json_dtype(json_type): + data = [ + "1", + "false", + '["a", {"b": 1}, null]', + None, + ] + df = dataframe.DataFrame({"json_col": data}, dtype=json_type) + + assert df["json_col"].dtype == dtypes.JSON_DTYPE + assert df["json_col"][1] == "false" + + +def test_df_construct_inline_respects_location(reset_default_session_and_location): + # Note: This starts a thread-local session. + with bpd.option_context("bigquery.location", "europe-west1"): + df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) + df.to_gbq() + assert df.query_job is not None + table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) + + assert table.location == "europe-west1" + + +def test_df_construct_dtype(): + data = { + "int_col": [1, 2, 3], + "string_col": ["1.1", "2.0", "3.5"], + "float_col": [1.0, 2.0, 3.0], + } + dtype = pd.StringDtype(storage="pyarrow") + bf_result = dataframe.DataFrame(data, dtype=dtype) + pd_result = pd.DataFrame(data, dtype=dtype) + pd_result.index = pd_result.index.astype("Int64") + pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_get_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + series = scalars_df[col_name] + bf_result = series.to_pandas() + pd_result = scalars_pandas_df[col_name] + assert_series_equal(bf_result, pd_result) + + +def test_get_column_nonstring(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df.rename(columns={"int64_col": 123.1})[123.1] + bf_result = series.to_pandas() + pd_result = scalars_pandas_df.rename(columns={"int64_col": 123.1})[123.1] + assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + "row_slice", + [ + (slice(1, 7, 2)), + (slice(1, 7, None)), + (slice(None, -3, None)), + ], +) +def test_get_rows_with_slice(scalars_dfs, row_slice): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[row_slice].to_pandas() + pd_result = scalars_pandas_df[row_slice] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_hasattr(scalars_dfs): + scalars_df, _ = scalars_dfs + assert hasattr(scalars_df, "int64_col") + assert hasattr(scalars_df, "head") + assert not hasattr(scalars_df, "not_exist") + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_head_with_custom_column_labels( + scalars_df_index, scalars_pandas_df_index, ordered +): + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) + bf_result = bf_df.to_pandas(ordered=ordered) + pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + bf_df = scalars_df_index.rename(columns=rename_mapping).tail(3) + bf_result = bf_df.to_pandas() + pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).tail(3) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + ("all",), + ], +) +def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep): + bf_result = scalars_df_index.nlargest(3, ["bool_col", "int64_too"], keep=keep) + pd_result = scalars_pandas_df_index.nlargest( + 3, ["bool_col", "int64_too"], keep=keep + ) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + ("all",), + ], +) +def test_df_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): + bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep) + pd_result = scalars_pandas_df_index.nsmallest(6, ["bool_col"], keep=keep) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_get_column_by_attr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df.int64_col + bf_result = series.to_pandas() + pd_result = scalars_pandas_df.int64_col + assert_series_equal(bf_result, pd_result) + + +def test_get_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["bool_col", "float64_col", "int64_col"] + df_subset = scalars_df.get(col_names) + df_pandas = df_subset.to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df[col_names].columns + ) + + +def test_get_columns_default(scalars_dfs): + scalars_df, _ = scalars_dfs + col_names = ["not", "column", "names"] + result = scalars_df.get(col_names, "default_val") + assert result == "default_val" + + +@pytest.mark.parametrize( + ("loc", "column", "value", "allow_duplicates"), + [ + (0, 666, 2, False), + (5, "float64_col", 2.2, True), + (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True), + pytest.param( + 14, + "test", + 2, + False, + marks=pytest.mark.xfail( + raises=IndexError, + ), + ), + pytest.param( + 12, + "int64_col", + 2, + False, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], +) +def test_insert(scalars_dfs, loc, column, value, allow_duplicates): + scalars_df, scalars_pandas_df = scalars_dfs + # insert works inplace, so will influence other tests. + # make a copy to avoid inplace changes. + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.insert(loc, column, value, allow_duplicates) + pd_df.insert(loc, column, value, allow_duplicates) + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False) + + +def test_mask_series_cond(scalars_df_index, scalars_pandas_df_index): + cond_bf = scalars_df_index["int64_col"] > 0 + cond_pd = scalars_pandas_df_index["int64_col"] > 0 + + bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] + pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] + bf_result = bf_df.mask(cond_bf, bf_df + 1).to_pandas() + pd_result = pd_df.mask(cond_pd, pd_df + 1) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_mask_callable(scalars_df_index, scalars_pandas_df_index): + def is_positive(x): + return x > 0 + + bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] + pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] + bf_result = bf_df.mask(cond=is_positive, other=lambda x: x + 1).to_pandas() + pd_result = pd_df.mask(cond=is_positive, other=lambda x: x + 1) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_multi_column(scalars_df_index, scalars_pandas_df_index): + # Test when a dataframe has multi-columns. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + + dataframe_bf.columns = pd.MultiIndex.from_tuples( + [("str1", 1), ("str2", 2)], names=["STR", "INT"] + ) + cond_bf = dataframe_bf["str1"] > 0 + + with pytest.raises(NotImplementedError) as context: + dataframe_bf.where(cond_bf).to_pandas() + assert ( + str(context.value) + == "The dataframe.where() method does not support multi-column." + ) + + +def test_where_series_cond(scalars_df_index, scalars_pandas_df_index): + # Condition is dataframe, other is None (as default). + cond_bf = scalars_df_index["int64_col"] > 0 + cond_pd = scalars_pandas_df_index["int64_col"] > 0 + bf_result = scalars_df_index.where(cond_bf).to_pandas() + pd_result = scalars_pandas_df_index.where(cond_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_series_cond_const_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a series, other is a constant. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + dataframe_bf.columns.name = "test_name" + dataframe_pd.columns.name = "test_name" + + cond_bf = dataframe_bf["int64_col"] > 0 + cond_pd = dataframe_pd["int64_col"] > 0 + other = 0 + + bf_result = dataframe_bf.where(cond_bf, other).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_series_cond_dataframe_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a series, other is a dataframe. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf["int64_col"] > 0 + cond_pd = dataframe_pd["int64_col"] > 0 + other_bf = -dataframe_bf + other_pd = -dataframe_pd + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond(scalars_df_index, scalars_pandas_df_index): + # Condition is a dataframe, other is None. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + + bf_result = dataframe_bf.where(cond_bf, None).to_pandas() + pd_result = dataframe_pd.where(cond_pd, None) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond_const_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a dataframe, other is a constant. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + other_bf = 10 + other_pd = 10 + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond_dataframe_other( + scalars_df_index, scalars_pandas_df_index +): + # Condition is a dataframe, other is a dataframe. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + other_bf = dataframe_bf * 2 + other_pd = dataframe_pd * 2 + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_callable_cond_constant_other(scalars_df_index, scalars_pandas_df_index): + # Condition is callable, other is a constant. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + other = 10 + + bf_result = dataframe_bf.where(lambda x: x > 0, other).to_pandas() + pd_result = dataframe_pd.where(lambda x: x > 0, other) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond_callable_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a dataframe, other is callable. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + + def func(x): + return x * 2 + + bf_result = dataframe_bf.where(cond_bf, func).to_pandas() + pd_result = dataframe_pd.where(cond_pd, func) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_callable_cond_callable_other(scalars_df_index, scalars_pandas_df_index): + # Condition is callable, other is callable too. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + def func(x): + return x["int64_col"] > 0 + + bf_result = dataframe_bf.where(func, lambda x: x * 2).to_pandas() + pd_result = dataframe_pd.where(func, lambda x: x * 2) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_series_other(scalars_df_index): + # When other is a series, throw an error. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + + with pytest.raises( + ValueError, + match="Seires is not a supported replacement type!", + ): + dataframe_bf.where(dataframe_bf > 0, dataframe_bf["int64_col"]) + + +def test_drop_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + df_pandas = scalars_df.drop(columns=col_name).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.drop(columns=col_name).columns + ) + + +def test_drop_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["int64_col", "geography_col", "time_col"] + df_pandas = scalars_df.drop(columns=col_names).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.drop(columns=col_names).columns + ) + + +def test_drop_labels_axis_1(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + labels = ["int64_col", "geography_col", "time_col"] + + pd_result = scalars_pandas_df.drop(labels=labels, axis=1) + bf_result = scalars_df.drop(labels=labels, axis=1).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_with_custom_column_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + dropped_columns = [ + "言語列", + "timestamp_col", + ] + bf_df = scalars_df.rename(columns=rename_mapping).drop(columns=dropped_columns) + bf_result = bf_df.to_pandas() + pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( + columns=dropped_columns + ) + assert_pandas_df_equal(bf_result, pd_result) + + +def test_df_memory_usage(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.memory_usage() + bf_result = scalars_df.memory_usage() + + pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5) + + +def test_df_info(scalars_dfs): + expected = ( + "\n" + "Index: 9 entries, 0 to 8\n" + "Data columns (total 14 columns):\n" + " # Column Non-Null Count Dtype\n" + "--- ------------- ---------------- ------------------------------\n" + " 0 bool_col 8 non-null boolean\n" + " 1 bytes_col 6 non-null binary[pyarrow]\n" + " 2 date_col 7 non-null date32[day][pyarrow]\n" + " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" + " 4 geography_col 4 non-null geometry\n" + " 5 int64_col 8 non-null Int64\n" + " 6 int64_too 9 non-null Int64\n" + " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" + " 8 float64_col 7 non-null Float64\n" + " 9 rowindex_2 9 non-null Int64\n" + " 10 string_col 8 non-null string\n" + " 11 time_col 6 non-null time64[us][pyarrow]\n" + " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" + " 13 duration_col 7 non-null duration[us][pyarrow]\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), duration[us][pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "memory usage: 1341 bytes\n" + ) + + scalars_df, _ = scalars_dfs + bf_result = io.StringIO() + + scalars_df.info(buf=bf_result) + + assert expected == bf_result.getvalue() + + +@pytest.mark.parametrize( + ("include", "exclude"), + [ + ("Int64", None), + (["int"], None), + ("number", None), + ([pd.Int64Dtype(), pd.BooleanDtype()], None), + (None, [pd.Int64Dtype(), pd.BooleanDtype()]), + ("Int64", ["boolean"]), + ], +) +def test_select_dtypes(scalars_dfs, include, exclude): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.select_dtypes(include=include, exclude=exclude) + bf_result = scalars_df.select_dtypes(include=include, exclude=exclude).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(index=[4, 1, 2]) + bf_result = scalars_df.drop(index=[4, 1, 2]).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_pandas_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + drop_index = scalars_pandas_df.iloc[[4, 1, 2]].index + + pd_result = scalars_pandas_df.drop(index=drop_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + drop_index = scalars_df.loc[[4, 1, 2]].index + drop_pandas_index = scalars_pandas_df.loc[[4, 1, 2]].index + + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_index_with_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df = scalars_df.set_index("bytes_col") + scalars_pandas_df = scalars_pandas_df.set_index("bytes_col") + drop_index = scalars_df.iloc[[3, 5]].index + drop_pandas_index = scalars_pandas_df.iloc[[3, 5]].index + + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) # drop_pandas_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_multiindex(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + sub_df = scalars_df.iloc[[4, 1, 2]] + sub_pandas_df = scalars_pandas_df.iloc[[4, 1, 2]] + sub_df = sub_df.set_index(["bytes_col", "numeric_col"]) + sub_pandas_df = sub_pandas_df.set_index(["bytes_col", "numeric_col"]) + drop_index = sub_df.index + drop_pandas_index = sub_pandas_df.index + + scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) + scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_labels_axis_0(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(labels=[4, 1, 2], axis=0) + bf_result = scalars_df.drop(labels=[4, 1, 2], axis=0).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_index_and_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(index=[4, 1, 2], columns="int64_col") + bf_result = scalars_df.drop(index=[4, 1, 2], columns="int64_col").to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"bool_col": 1.2345} + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_df_peek(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + peek_result = scalars_df.peek(n=3, force=False, allow_large_results=True) + + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_with_large_results_not_allowed(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + peek_result = scalars_df.peek(n=3, force=False, allow_large_results=False) + + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[scalars_df.int64_col != 0].peek(n=3, force=False) + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_exception(scalars_dfs): + scalars_df, _ = scalars_dfs + + with pytest.raises(ValueError): + # Window ops aren't compatible with efficient peeking + scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3, force=False) + + +def test_df_peek_force_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + +def test_df_peek_reset_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = ( + scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3) + ) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + +def test_repr_w_all_rows(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + # Remove columns with flaky formatting, like NUMERIC columns (which use the + # object dtype). Also makes a copy so that mutating the index name doesn't + # break other tests. + scalars_df = scalars_df.drop(columns=["numeric_col"]) + scalars_pandas_df = scalars_pandas_df.drop(columns=["numeric_col"]) + + # When there are 10 or fewer rows, the outputs should be identical. + actual = repr(scalars_df.head(10)) + + with display_options.pandas_repr(bigframes.options.display): + expected = repr(scalars_pandas_df.head(10)) + + assert actual == expected + + +def test_join_repr(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + scalars_df = ( + scalars_df[["int64_col"]] + .join(scalars_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + scalars_pandas_df = ( + scalars_pandas_df[["int64_col"]] + .join(scalars_pandas_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + # Pandas join result index name seems to depend on the index values in a way that bigframes can't match exactly + scalars_pandas_df.index.name = None + + actual = repr(scalars_df) + + with display_options.pandas_repr(bigframes.options.display): + expected = repr(scalars_pandas_df) + + assert actual == expected + + +def test_repr_w_display_options(scalars_dfs, session): + metrics = session._metrics + scalars_df, _ = scalars_dfs + # get a pandas df of the expected format + df, _ = scalars_df._block.to_pandas() + pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) + pandas_df.index.name = scalars_df.index.name + + executions_pre = metrics.execution_count + with bigframes.option_context( + "display.max_rows", 10, "display.max_columns", 5, "display.max_colwidth", 10 + ): + + # When there are 10 or fewer rows, the outputs should be identical except for the extra note. + actual = scalars_df.head(10).__repr__() + executions_post = metrics.execution_count + + with display_options.pandas_repr(bigframes.options.display): + pandas_repr = pandas_df.head(10).__repr__() + + assert actual == pandas_repr + assert (executions_post - executions_pre) <= 3 + + +def test_repr_html_w_all_rows(scalars_dfs, session): + metrics = session._metrics + scalars_df, _ = scalars_dfs + # get a pandas df of the expected format + df, _ = scalars_df._block.to_pandas() + pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) + pandas_df.index.name = scalars_df.index.name + + executions_pre = metrics.execution_count + # When there are 10 or fewer rows, the outputs should be identical except for the extra note. + actual = scalars_df.head(10)._repr_html_() + executions_post = metrics.execution_count + + with display_options.pandas_repr(bigframes.options.display): + pandas_repr = pandas_df.head(10)._repr_html_() + + expected = ( + pandas_repr + + f"[{len(pandas_df.index)} rows x {len(pandas_df.columns)} columns in total]" + ) + assert actual == expected + assert (executions_post - executions_pre) <= 3 + + +def test_df_column_name_with_space(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"bool_col": "bool col"} + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_df_column_name_duplicate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"int64_too": "int64_col"} + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_get_df_column_name_duplicate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"int64_too": "int64_col"} + + bf_result = scalars_df.rename(columns=col_name_dict)["int64_col"].to_pandas() + pd_result = scalars_pandas_df.rename(columns=col_name_dict)["int64_col"] + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + + +@pytest.mark.parametrize( + ("indices", "axis"), + [ + ([1, 3, 5], 0), + ([2, 4, 6], 1), + ([1, -3, -5, -6], "index"), + ([-2, -4, -6], "columns"), + ], +) +def test_take_df(scalars_dfs, indices, axis): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.take(indices, axis=axis).to_pandas() + pd_result = scalars_pandas_df.take(indices, axis=axis) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_filter_df(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_bool_series = scalars_df["bool_col"] + bf_result = scalars_df[bf_bool_series].to_pandas() + + pd_bool_series = scalars_pandas_df["bool_col"] + pd_result = scalars_pandas_df[pd_bool_series] + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_read_gbq_direct_to_batches_row_count(unordered_session): + df = unordered_session.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") + iter = df.to_pandas_batches() + assert iter.total_rows == 5552452 + + +def test_df_to_pandas_batches(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + capped_unfiltered_batches = scalars_df.to_pandas_batches(page_size=2, max_results=6) + bf_bool_series = scalars_df["bool_col"] + filtered_batches = scalars_df[bf_bool_series].to_pandas_batches() + + pd_bool_series = scalars_pandas_df["bool_col"] + pd_result = scalars_pandas_df[pd_bool_series] + + assert 6 == capped_unfiltered_batches.total_rows + assert len(pd_result) == filtered_batches.total_rows + assert_pandas_df_equal(pd.concat(filtered_batches), pd_result) + + +@pytest.mark.parametrize( + ("literal", "expected_dtype"), + ( + pytest.param( + 2, + dtypes.INT_DTYPE, + id="INT64", + ), + # ==================================================================== + # NULL values + # + # These are regression tests for b/428999884. It needs to be possible to + # set a column to NULL with a desired type (not just the pandas default + # of float64). + # ==================================================================== + pytest.param(None, dtypes.FLOAT_DTYPE, id="NULL-None"), + pytest.param( + pa.scalar(None, type=pa.int64()), + dtypes.INT_DTYPE, + id="NULL-pyarrow-TIMESTAMP", + ), + pytest.param( + pa.scalar(None, type=pa.timestamp("us", tz="UTC")), + dtypes.TIMESTAMP_DTYPE, + id="NULL-pyarrow-TIMESTAMP", + ), + pytest.param( + pa.scalar(None, type=pa.timestamp("us")), + dtypes.DATETIME_DTYPE, + id="NULL-pyarrow-DATETIME", + ), + ), +) +def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype): + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.assign(new_col=literal) + bf_result = df.to_pandas() + + new_col_pd = literal + if isinstance(literal, pa.Scalar): + # PyArrow integer scalars aren't yet supported in pandas Int64Dtype. + new_col_pd = literal.as_py() + + # Pandas might not pick the same dtype as BigFrames, but it should at least + # be castable to it. + pd_result = scalars_pandas_df.assign(new_col=new_col_pd) + pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_new_column_w_loc(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[:, "new_col"] = 2 + pd_df.loc[:, "new_col"] = 2 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("scalar",), + [ + (2.1,), + (None,), + ], +) +def test_assign_new_column_w_setitem(scalars_dfs, scalar): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = scalar + pd_df["new_col"] = scalar + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `float64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Float64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_dataframe(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["int64_col"] = bf_df["int64_too"].to_frame() + pd_df["int64_col"] = pd_df["int64_too"].to_frame() + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_df["int64_col"] = pd_df["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df) + + +def test_assign_new_column_w_setitem_dataframe_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(ValueError): + bf_df["impossible_col"] = bf_df[["int64_too", "string_col"]] + with pytest.raises(ValueError): + pd_df["impossible_col"] = pd_df[["int64_too", "string_col"]] + + +def test_assign_new_column_w_setitem_list(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + pd_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col_2"] = pd_result["new_col_2"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + # set the custom index + pd_df = pd_df.set_index(["string_col", "int64_col"]) + bf_df = bf_df.set_index(["string_col", "int64_col"]) + + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(ValueError): + pd_df["new_col"] = [1, 2, 3] # should be len 9, is 3 + with pytest.raises(ValueError): + bf_df["new_col"] = [1, 2, 3] + + +@pytest.mark.parametrize( + ("key", "value"), + [ + pytest.param(["int64_col", "int64_too"], 1, id="scalar_to_existing_column"), + pytest.param( + ["int64_col", "int64_too"], [1, 2], id="sequence_to_existing_column" + ), + pytest.param( + ["int64_col", "new_col"], [1, 2], id="sequence_to_partial_new_column" + ), + pytest.param( + ["new_col", "new_col_too"], [1, 2], id="sequence_to_full_new_column" + ), + pytest.param( + pd.Index(("new_col", "new_col_too")), + [1, 2], + id="sequence_to_full_new_column_as_index", + ), + ], +) +def test_setitem_multicolumn_with_literals(scalars_dfs, key, value): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.copy() + pd_result = scalars_pandas_df.copy() + + bf_result[key] = value + pd_result[key] = value + + pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False) + + +def test_setitem_multicolumn_with_literals_different_lengths_raise_error(scalars_dfs): + scalars_df, _ = scalars_dfs + bf_result = scalars_df.copy() + + with pytest.raises(ValueError): + bf_result[["int64_col", "int64_too"]] = [1] + + +def test_setitem_multicolumn_with_dataframes(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.copy() + pd_result = scalars_pandas_df.copy() + + bf_result[["int64_col", "int64_too"]] = bf_result[["int64_too", "int64_col"]] / 2 + pd_result[["int64_col", "int64_too"]] = pd_result[["int64_too", "int64_col"]] / 2 + + pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False) + + +def test_setitem_multicolumn_with_dataframes_series_on_rhs_raise_error(scalars_dfs): + scalars_df, _ = scalars_dfs + bf_result = scalars_df.copy() + + with pytest.raises(ValueError): + bf_result[["int64_col", "int64_too"]] = bf_result["int64_col"] / 2 + + +def test_setitem_multicolumn_with_dataframes_different_lengths_raise_error(scalars_dfs): + scalars_df, _ = scalars_dfs + bf_result = scalars_df.copy() + + with pytest.raises(ValueError): + bf_result[["int64_col"]] = bf_result[["int64_col", "int64_too"]] / 2 + + +def test_assign_existing_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"int64_col": 2} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_listlike_to_empty_df(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() + + bf_result = empty_df.assign(new_col=[1, 2, 3]) + pd_result = empty_pandas_df.assign(new_col=[1, 2, 3]) + + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result.index = pd_result.index.astype("Int64") + assert_pandas_df_equal(bf_result.to_pandas(), pd_result) + + +def test_assign_to_empty_df_multiindex_error(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() + + empty_df["empty_col_1"] = typing.cast(series.Series, []) + empty_df["empty_col_2"] = typing.cast(series.Series, []) + empty_pandas_df["empty_col_1"] = [] + empty_pandas_df["empty_col_2"] = [] + empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"]) + empty_pandas_df = empty_pandas_df.set_index(["empty_col_1", "empty_col_2"]) + + with pytest.raises(ValueError): + empty_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + with pytest.raises(ValueError): + empty_pandas_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_assign_series(scalars_dfs, ordered): + scalars_df, scalars_pandas_df = scalars_dfs + column_name = "int64_col" + df = scalars_df.assign(new_col=scalars_df[column_name]) + bf_result = df.to_pandas(ordered=ordered) + pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +def test_assign_series_overwrite(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + column_name = "int64_col" + df = scalars_df.assign(**{column_name: scalars_df[column_name] + 3}) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign( + **{column_name: scalars_pandas_df[column_name] + 3} + ) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_sequential(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"int64_col": 2, "new_col": 3, "new_col2": 4} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +# Require an index so that the self-join is consistent each time. +def test_assign_same_table_different_index_performs_self_join( + scalars_df_index, scalars_pandas_df_index +): + column_name = "int64_col" + bf_df = scalars_df_index.assign( + alternative_index=scalars_df_index["rowindex_2"] + 2 + ) + pd_df = scalars_pandas_df_index.assign( + alternative_index=scalars_pandas_df_index["rowindex_2"] + 2 + ) + bf_df_2 = bf_df.set_index("alternative_index") + pd_df_2 = pd_df.set_index("alternative_index") + bf_result = bf_df.assign(new_col=bf_df_2[column_name] * 10).to_pandas() + pd_result = pd_df.assign(new_col=pd_df_2[column_name] * 10) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +# Different table expression must have Index +def test_assign_different_df( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + column_name = "int64_col" + df = scalars_df_index.assign(new_col=scalars_df_2_index[column_name]) + bf_result = df.to_pandas() + # Doesn't matter to pandas if it comes from the same DF or a different DF. + pd_result = scalars_pandas_df_index.assign( + new_col=scalars_pandas_df_index[column_name] + ) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_different_df_w_loc( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + bf_df = scalars_df_index.copy() + bf_df2 = scalars_df_2_index.copy() + pd_df = scalars_pandas_df_index.copy() + assert "int64_col" in bf_df.columns + assert "int64_col" in pd_df.columns + bf_df.loc[:, "int64_col"] = bf_df2.loc[:, "int64_col"] + 1 + pd_df.loc[:, "int64_col"] = pd_df.loc[:, "int64_col"] + 1 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_different_df_w_setitem( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + bf_df = scalars_df_index.copy() + bf_df2 = scalars_df_2_index.copy() + pd_df = scalars_pandas_df_index.copy() + assert "int64_col" in bf_df.columns + assert "int64_col" in pd_df.columns + bf_df["int64_col"] = bf_df2["int64_col"] + 1 + pd_df["int64_col"] = pd_df["int64_col"] + 1 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_callable_lambda(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"new_col": lambda x: x["int64_col"] + x["int64_too"]} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("axis", "how", "ignore_index", "subset"), + [ + (0, "any", False, None), + (0, "any", True, None), + (0, "all", False, ["bool_col", "time_col"]), + (0, "any", False, ["bool_col", "time_col"]), + (0, "all", False, "time_col"), + (1, "any", False, None), + (1, "all", False, None), + ], +) +def test_df_dropna_by_how(scalars_dfs, axis, how, ignore_index, subset): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("axis", "ignore_index", "subset", "thresh"), + [ + (0, False, None, 2), + (0, True, None, 3), + (1, False, None, 2), + ], +) +def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh): + """ + Tests that dropna correctly keeps rows/columns with a minimum number + of non-null values. + """ + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + df_result = scalars_df.dropna( + axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset + ) + pd_result = scalars_pandas_df.dropna( + axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset + ) + + bf_result = df_result.to_pandas() + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_dropna_range_columns(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df.columns = pandas.RangeIndex(0, len(scalars_df.columns)) + scalars_pandas_df.columns = pandas.RangeIndex(0, len(scalars_pandas_df.columns)) + + df = scalars_df.dropna() + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.dropna() + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_interpolate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "int64_too", "float64_col"] + bf_result = scalars_df[columns].interpolate().to_pandas() + # Pandas can only interpolate on "float64" columns + # https://github.com/pandas-dev/pandas/issues/40252 + pd_result = scalars_pandas_df[columns].astype("float64").interpolate() + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "col, fill_value", + [ + (["int64_col", "float64_col"], 3), + (["string_col"], "A"), + (["datetime_col"], pd.Timestamp("2023-01-01")), + ], +) +def test_df_fillna(scalars_dfs, col, fill_value): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col].fillna(fill_value).to_pandas() + pd_result = scalars_pandas_df[col].fillna(fill_value) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_replace_scalar_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace(555.555, 3).to_pandas() + pd_result = scalars_pandas_df.replace(555.555, 3) + + # pandas has narrower result types as they are determined dynamically + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + +def test_df_replace_regex_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace("^H.l", "Howdy, Planet!", regex=True).to_pandas() + pd_result = scalars_pandas_df.replace("^H.l", "Howdy, Planet!", regex=True) + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_df_replace_list_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace([555.555, 3.2], 3).to_pandas() + pd_result = scalars_pandas_df.replace([555.555, 3.2], 3) + + # pandas has narrower result types as they are determined dynamically + pd.testing.assert_frame_equal( + pd_result, + bf_result, + check_dtype=False, + ) + + +def test_df_replace_value_dict(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas() + pd_result = scalars_pandas_df.replace(1, {"int64_col": 100, "int64_too": 200}) + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_df_ffill(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas() + pd_result = scalars_pandas_df[["int64_col", "float64_col"]].ffill(limit=1) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_bfill(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[["int64_col", "float64_col"]].bfill().to_pandas() + pd_result = scalars_pandas_df[["int64_col", "float64_col"]].bfill() + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_series_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + + def foo(series, arg1, arg2, *, kwarg1=0, kwarg2=0): + return series**2 + (arg1 * arg2 % 4) + (kwarg1 * kwarg2 % 7) + + bf_result = ( + scalars_df_index[columns] + .apply(foo, args=(33, 61), kwarg1=52, kwarg2=21) + .to_pandas() + ) + + pd_result = scalars_pandas_df_index[columns].apply( + foo, args=(33, 61), kwarg1=52, kwarg2=21 + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_listlike_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + bf_result = ( + scalars_df_index[columns].apply(lambda x: [len(x), x.min(), 24]).to_pandas() + ) + + pd_result = scalars_pandas_df_index[columns].apply(lambda x: [len(x), x.min(), 24]) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result.index = pd_result.index.astype("Int64") + pd_result = pd_result.astype("Int64") + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_scalar_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + bf_result = scalars_df_index[columns].apply(lambda x: x.sum()) + + pd_result = scalars_pandas_df_index[columns].apply(lambda x: x.sum()) + + pandas.testing.assert_series_equal(bf_result, pd_result) + + +def test_df_pipe( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + + def foo(x: int, y: int, df): + return (df + x) % y + + bf_result = ( + scalars_df_index[columns] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + .to_pandas() + ) + + pd_result = ( + scalars_pandas_df_index[columns] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_keys( + scalars_df_index, + scalars_pandas_df_index, +): + pandas.testing.assert_index_equal( + scalars_df_index.keys(), scalars_pandas_df_index.keys() + ) + + +def test_df_iter( + scalars_df_index, + scalars_pandas_df_index, +): + for bf_i, df_i in zip(scalars_df_index, scalars_pandas_df_index): + assert bf_i == df_i + + +def test_iterrows( + scalars_df_index, + scalars_pandas_df_index, +): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df_index = scalars_df_index.add_suffix("_suffix", axis=1) + scalars_pandas_df_index = scalars_pandas_df_index.add_suffix("_suffix", axis=1) + for (bf_index, bf_series), (pd_index, pd_series) in zip( + scalars_df_index.iterrows(), scalars_pandas_df_index.iterrows() + ): + assert bf_index == pd_index + pandas.testing.assert_series_equal(bf_series, pd_series) + + +@pytest.mark.parametrize( + ( + "index", + "name", + ), + [ + ( + True, + "my_df", + ), + (False, None), + ], +) +def test_itertuples(scalars_df_index, index, name): + # Numeric has slightly different representation as a result of conversions. + bf_tuples = scalars_df_index.itertuples(index, name) + pd_tuples = scalars_df_index.to_pandas().itertuples(index, name) + for bf_tuple, pd_tuple in zip(bf_tuples, pd_tuples): + assert bf_tuple == pd_tuple + + +def test_df_isin_list_w_null(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + values = ["Hello, World!", 55555, 2.51, pd.NA, True] + bf_result = ( + scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] + .isin(values) + .to_pandas() + ) + pd_result = scalars_pandas_df[ + ["int64_col", "float64_col", "string_col", "bool_col"] + ].isin(values) + + pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) + + +def test_df_isin_list_wo_null(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + values = ["Hello, World!", 55555, 2.51, True] + bf_result = ( + scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] + .isin(values) + .to_pandas() + ) + pd_result = scalars_pandas_df[ + ["int64_col", "float64_col", "string_col", "bool_col"] + ].isin(values) + + pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) + + +def test_df_isin_dict(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + values = { + "string_col": ["Hello, World!", 55555, 2.51, pd.NA, True], + "int64_col": [5555, 2.51], + "bool_col": [pd.NA], + } + bf_result = ( + scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] + .isin(values) + .to_pandas() + ) + pd_result = scalars_pandas_df[ + ["int64_col", "float64_col", "string_col", "bool_col"] + ].isin(values) + + pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) + + +def test_df_cross_merge(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. + right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + bf_result = left.merge(right, "cross").to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "cross", + ) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_df_merge(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + on = "rowindex_2" + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. + right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + df = left.merge(right, merge_how, on, sort=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + merge_how, + on, + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("left_on", "right_on"), + [ + (["int64_col", "rowindex_2"], ["int64_col", "rowindex_2"]), + (["rowindex_2", "int64_col"], ["int64_col", "rowindex_2"]), + (["rowindex_2", "float64_col"], ["int64_col", "rowindex_2"]), + ], +) +def test_df_merge_multi_key(scalars_dfs, left_on, right_on): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. + right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + df = left.merge(right, "outer", left_on=left_on, right_on=right_on, sort=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "outer", + left_on=left_on, + right_on=right_on, + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_custom_col_name(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col"] + right_columns = ["int64_col", "bool_col", "string_col"] + on = "int64_col" + rename_columns = {"float64_col": "f64_col"} + + left = scalars_df[left_columns] + left = left.rename(columns=rename_columns) + right = scalars_df[right_columns] + df = left.merge(right, merge_how, on, sort=True) + bf_result = df.to_pandas() + + pandas_left_df = scalars_pandas_df[left_columns] + pandas_left_df = pandas_left_df.rename(columns=rename_columns) + pandas_right_df = scalars_pandas_df[right_columns] + pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_left_on_right_on(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "int64_too"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + right = scalars_df[right_columns] + + df = left.merge( + right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True + ) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns], + merge_how, + left_on="int64_too", + right_on="rowindex_2", + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +def test_self_merge_self_w_on_args(): + data = { + "A": pd.Series([1, 2, 3], dtype="Int64"), + "B": pd.Series([1, 2, 3], dtype="Int64"), + "C": pd.Series([100, 200, 300], dtype="Int64"), + "D": pd.Series(["alpha", "beta", "gamma"], dtype="string[pyarrow]"), + } + df = pd.DataFrame(data) + + df1 = df[["A", "C"]] + df2 = df[["B", "C", "D"]] + pd_result = df1.merge(df2, left_on=["A", "C"], right_on=["B", "C"], how="inner") + + bf_df = bpd.DataFrame(data) + + bf_df1 = bf_df[["A", "C"]] + bf_df2 = bf_df[["B", "C", "D"]] + bf_result = bf_df1.merge( + bf_df2, left_on=["A", "C"], right_on=["B", "C"], how="inner" + ).to_pandas() + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("decimals",), + [ + (2,), + ({"float64_col": 0, "bool_col": 1, "int64_too": -3},), + ({},), + ], +) +def test_dataframe_round(scalars_dfs, decimals): + if pd.__version__.startswith("1."): + pytest.skip("Rounding doesn't work as expected in pandas 1.x") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.round(decimals).to_pandas() + pd_result = scalars_pandas_df.round(decimals) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_get_dtypes(scalars_df_default_index): + dtypes = scalars_df_default_index.dtypes + dtypes_dict: Dict[str, bigframes.dtypes.Dtype] = { + "bool_col": pd.BooleanDtype(), + "bytes_col": pd.ArrowDtype(pa.binary()), + "date_col": pd.ArrowDtype(pa.date32()), + "datetime_col": pd.ArrowDtype(pa.timestamp("us")), + "geography_col": gpd.array.GeometryDtype(), + "int64_col": pd.Int64Dtype(), + "int64_too": pd.Int64Dtype(), + "numeric_col": pd.ArrowDtype(pa.decimal128(38, 9)), + "float64_col": pd.Float64Dtype(), + "rowindex": pd.Int64Dtype(), + "rowindex_2": pd.Int64Dtype(), + "string_col": pd.StringDtype(storage="pyarrow"), + "time_col": pd.ArrowDtype(pa.time64("us")), + "timestamp_col": pd.ArrowDtype(pa.timestamp("us", tz="UTC")), + "duration_col": pd.ArrowDtype(pa.duration("us")), + } + pd.testing.assert_series_equal( + dtypes, + pd.Series(dtypes_dict), + ) + + +def test_get_dtypes_array_struct_query(session): + df = session.read_gbq( + """SELECT + [1, 3, 2] AS array_column, + STRUCT( + "a" AS string_field, + 1.2 AS float_field) AS struct_column""" + ) + + dtypes = df.dtypes + pd.testing.assert_series_equal( + dtypes, + pd.Series( + { + "array_column": pd.ArrowDtype(pa.list_(pa.int64())), + "struct_column": pd.ArrowDtype( + pa.struct( + [ + ("string_field", pa.string()), + ("float_field", pa.float64()), + ] + ) + ), + } + ), + ) + + +def test_get_dtypes_array_struct_table(nested_df): + dtypes = nested_df.dtypes + pd.testing.assert_series_equal( + dtypes, + pd.Series( + { + "customer_id": pd.StringDtype(storage="pyarrow"), + "day": pd.ArrowDtype(pa.date32()), + "flag": pd.Int64Dtype(), + "label": pd.ArrowDtype( + pa.struct( + [ + ("key", pa.string()), + ("value", pa.string()), + ] + ), + ), + "event_sequence": pd.ArrowDtype( + pa.list_( + pa.struct( + [ + pa.field( + "data", + pa.list_( + pa.struct( + [ + ("value", pa.float64()), + ("key", pa.string()), + ], + ), + ), + nullable=False, + ), + ("timestamp", pa.timestamp("us", "UTC")), + ("category", pa.string()), + ] + ), + ), + ), + "address": pd.ArrowDtype( + pa.struct( + [ + ("street", pa.string()), + ("city", pa.string()), + ] + ), + ), + } + ), + ) + + +def test_shape(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.shape + pd_result = scalars_pandas_df.shape + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + "reference_table, test_table", + [ + ( + "bigframes-dev.bigframes_tests_sys.base_table", + "bigframes-dev.bigframes_tests_sys.base_table_mat_view", + ), + ( + "bigframes-dev.bigframes_tests_sys.base_table", + "bigframes-dev.bigframes_tests_sys.base_table_view", + ), + ( + "bigframes-dev.bigframes_tests_sys.csv_native_table", + "bigframes-dev.bigframes_tests_sys.csv_external_table", + ), + ], +) +def test_view_and_external_table_shape(session, reference_table, test_table): + reference_df = session.read_gbq(reference_table) + test_df = session.read_gbq(test_table) + + assert test_df.shape == reference_df.shape + + +def test_len(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = len(scalars_df) + pd_result = len(scalars_pandas_df) + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("n_rows",), + [ + (50,), + (10000,), + ], +) +@pytest.mark.parametrize( + "write_engine", + ["bigquery_load", "bigquery_streaming", "bigquery_write"], +) +def test_df_len_local(session, n_rows, write_engine): + assert ( + len( + session.read_pandas( + pd.DataFrame(np.random.randint(1, 7, n_rows), columns=["one"]), + write_engine=write_engine, + ) + ) + == n_rows + ) + + +def test_size(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.size + pd_result = scalars_pandas_df.size + + assert bf_result == pd_result + + +def test_ndim(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.ndim + pd_result = scalars_pandas_df.ndim + + assert bf_result == pd_result + + +def test_empty_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.empty + pd_result = scalars_pandas_df.empty + + assert bf_result == pd_result + + +def test_empty_true_column_filter(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df[[]].empty + pd_result = scalars_pandas_df[[]].empty + + assert bf_result == pd_result + + +def test_empty_true_row_filter(scalars_dfs: Tuple[dataframe.DataFrame, pd.DataFrame]): + scalars_df, scalars_pandas_df = scalars_dfs + bf_bool: series.Series = typing.cast(series.Series, scalars_df["bool_col"]) + pd_bool: pd.Series = scalars_pandas_df["bool_col"] + bf_false = bf_bool.notna() & (bf_bool != bf_bool) + pd_false = pd_bool.notna() & (pd_bool != pd_bool) + + bf_result = scalars_df[bf_false].empty + pd_result = scalars_pandas_df[pd_false].empty + + assert pd_result + assert bf_result == pd_result + + +def test_empty_true_memtable(session: bigframes.Session): + bf_df = dataframe.DataFrame(session=session) + pd_df = pd.DataFrame() + + bf_result = bf_df.empty + pd_result = pd_df.empty + + assert pd_result + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("drop",), + ((True,), (False,)), +) +def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop): + df = scalars_df_index.reset_index(drop=drop) + assert df.index.name is None + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.reset_index(drop=drop) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy() + scalars_df_index.index.name = "int64_col" + df = scalars_df_index.reset_index(allow_duplicates=True, drop=False) + assert df.index.name is None + + bf_result = df.to_pandas() + + scalars_pandas_df_index = scalars_pandas_df_index.copy() + scalars_pandas_df_index.index.name = "int64_col" + pd_result = scalars_pandas_df_index.reset_index(allow_duplicates=True, drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_duplicates_error(scalars_df_index): + scalars_df_index = scalars_df_index.copy() + scalars_df_index.index.name = "int64_col" + with pytest.raises(ValueError): + scalars_df_index.reset_index(allow_duplicates=False, drop=False) + + +@pytest.mark.parametrize( + ("drop",), + ((True,), (False,)), +) +def test_reset_index_inplace(scalars_df_index, scalars_pandas_df_index, drop): + df = scalars_df_index.copy() + df.reset_index(drop=drop, inplace=True) + assert df.index.name is None + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.copy() + pd_result.reset_index(drop=drop, inplace=True) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_then_filter( + scalars_df_index, + scalars_pandas_df_index, +): + bf_filter = scalars_df_index["bool_col"].fillna(True) + bf_df = scalars_df_index.reset_index()[bf_filter] + bf_result = bf_df.to_pandas() + pd_filter = scalars_pandas_df_index["bool_col"].fillna(True) + pd_result = scalars_pandas_df_index.reset_index()[pd_filter] + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering and index keys + # post-filter will have gaps. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_with_unnamed_index( + scalars_df_index, + scalars_pandas_df_index, +): + scalars_df_index = scalars_df_index.copy() + scalars_pandas_df_index = scalars_pandas_df_index.copy() + + scalars_df_index.index.name = None + scalars_pandas_df_index.index.name = None + df = scalars_df_index.reset_index(drop=False) + assert df.index.name is None + + # reset_index(drop=False) creates a new column "index". + assert df.columns[0] == "index" + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.reset_index(drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_with_unnamed_multiindex( + scalars_df_index, + scalars_pandas_df_index, +): + bf_df = dataframe.DataFrame( + ([1, 2, 3], [2, 5, 7]), + index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), + ) + pd_df = pd.DataFrame( + ([1, 2, 3], [2, 5, 7]), + index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), + ) + + bf_df = bf_df.reset_index() + pd_df = pd_df.reset_index() + + assert pd_df.columns[0] == "level_0" + assert bf_df.columns[0] == "level_0" + assert pd_df.columns[1] == "level_1" + assert bf_df.columns[1] == "level_1" + + +def test_reset_index_with_unnamed_index_and_index_column( + scalars_df_index, + scalars_pandas_df_index, +): + scalars_df_index = scalars_df_index.copy() + scalars_pandas_df_index = scalars_pandas_df_index.copy() + + scalars_df_index.index.name = None + scalars_pandas_df_index.index.name = None + df = scalars_df_index.assign(index=scalars_df_index["int64_col"]).reset_index( + drop=False + ) + assert df.index.name is None + + # reset_index(drop=False) creates a new column "level_0" if the "index" column already exists. + assert df.columns[0] == "level_0" + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.assign( + index=scalars_pandas_df_index["int64_col"] + ).reset_index(drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("drop",), + ( + (True,), + (False,), + ), +) +@pytest.mark.parametrize( + ("append",), + ( + (True,), + (False,), + ), +) +@pytest.mark.parametrize( + ("index_column",), + (("int64_too",), ("string_col",), ("timestamp_col",)), +) +def test_set_index(scalars_dfs, index_column, drop, append): + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.set_index(index_column, append=append, drop=drop) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.set_index(index_column, append=append, drop=drop) + + # Sort to disambiguate when there are duplicate index labels. + # Note: Doesn't use assert_pandas_df_equal_ignore_ordering because we get + # "ValueError: 'timestamp_col' is both an index level and a column label, + # which is ambiguous" when trying to sort by a column with the same name as + # the index. + bf_result = bf_result.sort_values("rowindex_2") + pd_result = pd_result.sort_values("rowindex_2") + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_set_index_key_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + with pytest.raises(KeyError): + scalars_pandas_df.set_index(["not_a_col"]) + with pytest.raises(KeyError): + scalars_df.set_index(["not_a_col"]) + + +@pytest.mark.parametrize( + ("ascending",), + ((True,), (False,)), +) +@pytest.mark.parametrize( + ("na_position",), + (("first",), ("last",)), +) +@pytest.mark.parametrize( + ("axis",), + ((0,), ("columns",)), +) +def test_sort_index(scalars_dfs, ascending, na_position, axis): + index_column = "int64_col" + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.set_index(index_column) + bf_result = df.sort_index( + ascending=ascending, na_position=na_position, axis=axis + ).to_pandas() + pd_result = scalars_pandas_df.set_index(index_column).sort_index( + ascending=ascending, na_position=na_position, axis=axis + ) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_dataframe_sort_index_inplace(scalars_dfs): + index_column = "int64_col" + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.copy().set_index(index_column) + df.sort_index(ascending=False, inplace=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df.set_index(index_column).sort_index(ascending=False) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_abs(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + columns = ["int64_col", "int64_too", "float64_col"] + + bf_result = scalars_df[columns].abs() + pd_result = scalars_pandas_df[columns].abs() + + assert_dfs_equivalent(pd_result, bf_result) + + +def test_df_pos(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (+scalars_df[["int64_col", "numeric_col"]]).to_pandas() + pd_result = +scalars_pandas_df[["int64_col", "numeric_col"]] + + assert_pandas_df_equal(pd_result, bf_result) + + +def test_df_neg(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (-scalars_df[["int64_col", "numeric_col"]]).to_pandas() + pd_result = -scalars_pandas_df[["int64_col", "numeric_col"]] + + assert_pandas_df_equal(pd_result, bf_result) + + +def test_df__abs__(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = ( + abs(scalars_df[["int64_col", "numeric_col", "float64_col"]]) + ).to_pandas() + pd_result = abs(scalars_pandas_df[["int64_col", "numeric_col", "float64_col"]]) + + assert_pandas_df_equal(pd_result, bf_result) + + +def test_df_invert(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "bool_col"] + + bf_result = (~scalars_df[columns]).to_pandas() + pd_result = ~scalars_pandas_df[columns] + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_df_isnull(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + columns = ["int64_col", "int64_too", "string_col", "bool_col"] + bf_result = scalars_df[columns].isnull().to_pandas() + pd_result = scalars_pandas_df[columns].isnull() + + # One of dtype mismatches to be documented. Here, the `bf_result.dtype` is + # `BooleanDtype` but the `pd_result.dtype` is `bool`. + pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype()) + pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype()) + pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) + pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_df_notnull(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + columns = ["int64_col", "int64_too", "string_col", "bool_col"] + bf_result = scalars_df[columns].notnull().to_pandas() + pd_result = scalars_pandas_df[columns].notnull() + + # One of dtype mismatches to be documented. Here, the `bf_result.dtype` is + # `BooleanDtype` but the `pd_result.dtype` is `bool`. + pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype()) + pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype()) + pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) + pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("left_labels", "right_labels", "overwrite", "fill_value"), + [ + (["a", "b", "c"], ["c", "a", "b"], True, None), + (["a", "b", "c"], ["c", "a", "b"], False, None), + (["a", "b", "c"], ["a", "b", "c"], False, 2), + ], + ids=[ + "one_one_match_overwrite", + "one_one_match_no_overwrite", + "exact_match", + ], +) +def test_combine( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, + left_labels, + right_labels, + overwrite, + fill_value, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns] + bf_df_a.columns = left_labels + bf_df_b = scalars_df_2_index[columns] + bf_df_b.columns = right_labels + bf_result = bf_df_a.combine( + bf_df_b, + lambda x, y: x**2 + 2 * x * y + y**2, + overwrite=overwrite, + fill_value=fill_value, + ).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns] + pd_df_a.columns = left_labels + pd_df_b = scalars_pandas_df_index[columns] + pd_df_b.columns = right_labels + pd_result = pd_df_a.combine( + pd_df_b, + lambda x, y: x**2 + 2 * x * y + y**2, + overwrite=overwrite, + fill_value=fill_value, + ) + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("overwrite", "filter_func"), + [ + (True, None), + (False, None), + (True, lambda x: x.isna() | (x % 2 == 0)), + ], + ids=[ + "default", + "overwritefalse", + "customfilter", + ], +) +def test_df_update(overwrite, filter_func): + if pd.__version__.startswith("1."): + pytest.skip("dtype handled differently in pandas 1.x.") + + index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") + + index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") + pd_df1 = pandas.DataFrame( + {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 + ) + pd_df2 = pandas.DataFrame( + {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, + dtype="Int64", + index=index2, + ) + + bf_df1 = dataframe.DataFrame(pd_df1) + bf_df2 = dataframe.DataFrame(pd_df2) + + bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func) + pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func) + + pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1) + + +def test_df_idxmin(): + pd_df = pd.DataFrame( + {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] + ) + bf_df = dataframe.DataFrame(pd_df) + + bf_result = bf_df.idxmin().to_pandas() + pd_result = pd_df.idxmin() + + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +def test_df_idxmax(): + pd_df = pd.DataFrame( + {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] + ) + bf_df = dataframe.DataFrame(pd_df) + + bf_result = bf_df.idxmax().to_pandas() + pd_result = pd_df.idxmax() + + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("join", "axis"), + [ + ("outer", None), + ("outer", 0), + ("outer", 1), + ("left", 0), + ("right", 1), + ("inner", None), + ("inner", 1), + ], +) +def test_df_align(join, axis): + + index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") + + index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") + pd_df1 = pandas.DataFrame( + {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 + ) + pd_df2 = pandas.DataFrame( + {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, + dtype="Int64", + index=index2, + ) + + bf_df1 = dataframe.DataFrame(pd_df1) + bf_df2 = dataframe.DataFrame(pd_df2) + + bf_result1, bf_result2 = bf_df1.align(bf_df2, join=join, axis=axis) + pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis) + + # Don't check dtype as pandas does unnecessary float conversion + assert isinstance(bf_result1, dataframe.DataFrame) and isinstance( + bf_result2, dataframe.DataFrame + ) + pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) + pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) + + +def test_combine_first( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns].iloc[0:6] + bf_df_a.columns = ["a", "b", "c"] + bf_df_b = scalars_df_2_index[columns].iloc[2:8] + bf_df_b.columns = ["b", "a", "d"] + bf_result = bf_df_a.combine_first(bf_df_b).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns].iloc[0:6] + pd_df_a.columns = ["a", "b", "c"] + pd_df_b = scalars_pandas_df_index[columns].iloc[2:8] + pd_df_b.columns = ["b", "a", "d"] + pd_result = pd_df_a.combine_first(pd_df_b) + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("columns", "numeric_only"), + [ + (["bool_col", "int64_col", "float64_col"], True), + (["bool_col", "int64_col", "float64_col"], False), + (["bool_col", "int64_col", "float64_col", "string_col"], True), + pytest.param( + ["bool_col", "int64_col", "float64_col", "string_col"], + False, + marks=pytest.mark.xfail( + raises=NotImplementedError, + ), + ), + ], +) +def test_df_corr_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + bf_result = scalars_df[columns].corr(numeric_only=numeric_only).to_pandas() + pd_result = scalars_pandas_df[columns].corr(numeric_only=numeric_only) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + # Only check row order in ordered mode. + pd.testing.assert_frame_equal( + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + check_like=~scalars_df._block.session._strictly_ordered, + ) + + +def test_df_corr_w_invalid_parameters(scalars_dfs): + columns = ["int64_too", "int64_col", "float64_col"] + scalars_df, _ = scalars_dfs + + with pytest.raises(NotImplementedError): + scalars_df[columns].corr(method="kendall") + + with pytest.raises(NotImplementedError): + scalars_df[columns].corr(min_periods=1) + + +@pytest.mark.parametrize( + ("columns", "numeric_only"), + [ + (["bool_col", "int64_col", "float64_col"], True), + (["bool_col", "int64_col", "float64_col"], False), + (["bool_col", "int64_col", "float64_col", "string_col"], True), + pytest.param( + ["bool_col", "int64_col", "float64_col", "string_col"], + False, + marks=pytest.mark.xfail( + raises=NotImplementedError, + ), + ), + ], +) +def test_cov_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas() + pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only) + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + # Only check row order in ordered mode. + pd.testing.assert_frame_equal( + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + check_like=~scalars_df._block.session._strictly_ordered, + ) + + +def test_df_corrwith_df(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + l_cols = ["int64_col", "float64_col", "int64_too"] + r_cols = ["int64_too", "float64_col"] + + bf_result = scalars_df[l_cols].corrwith(scalars_df[r_cols]).to_pandas() + pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_cols]) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_corrwith_df_numeric_only(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too", "string_col"] + r_cols = ["int64_too", "float64_col", "bool_col"] + + bf_result = ( + scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=True).to_pandas() + ) + pd_result = scalars_pandas_df[l_cols].corrwith( + scalars_pandas_df[r_cols], numeric_only=True + ) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_corrwith_df_non_numeric_error(scalars_dfs): + scalars_df, _ = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too", "string_col"] + r_cols = ["int64_too", "float64_col", "bool_col"] + + with pytest.raises(NotImplementedError): + scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=False) + + +def test_df_corrwith_series(scalars_dfs_maybe_ordered): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + l_cols = ["int64_col", "float64_col", "int64_too"] + r_col = "float64_col" + + bf_result = scalars_df[l_cols].corrwith(scalars_df[r_col]).to_pandas() + pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_col]) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("op"), + [ + operator.add, + operator.sub, + operator.mul, + operator.truediv, + operator.floordiv, + operator.eq, + operator.ne, + operator.gt, + operator.ge, + operator.lt, + operator.le, + ], + ids=[ + "add", + "subtract", + "multiply", + "true_divide", + "floor_divide", + "eq", + "ne", + "gt", + "ge", + "lt", + "le", + ], +) +# TODO(garrettwu): deal with NA values +@pytest.mark.parametrize(("other_scalar"), [1, 2.5, 0, 0.0]) +@pytest.mark.parametrize(("reverse_operands"), [True, False]) +def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "float64_col"] + + maybe_reversed_op = (lambda x, y: op(y, x)) if reverse_operands else op + + bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas() + pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_dataframe_string_radd_const(scalars_dfs): + pytest.importorskip( + "pandas", + minversion="2.0.0", + reason="PyArrow string addition requires pandas 2.0+", + ) + + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["string_col", "string_col"] + + bf_result = ("prefix" + scalars_df[columns]).to_pandas() + pd_result = "prefix" + scalars_pandas_df[columns] + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize(("other_scalar"), [1, -2]) +def test_mod(scalars_dfs, other_scalar): + # Zero case excluded as pandas produces 0 result for Int64 inputs rather than NA/NaN. + # This is likely a pandas bug as mod 0 is undefined in other dtypes, and most programming languages. + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas() + pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_scalar_binop_str_exception(scalars_dfs): + scalars_df, _ = scalars_dfs + columns = ["string_col"] + with pytest.raises(TypeError, match="Cannot add dtypes"): + (scalars_df[columns] + 1).to_pandas() + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x, y: x.add(y, axis="index")), + (lambda x, y: x.radd(y, axis="index")), + (lambda x, y: x.sub(y, axis="index")), + (lambda x, y: x.rsub(y, axis="index")), + (lambda x, y: x.mul(y, axis="index")), + (lambda x, y: x.rmul(y, axis="index")), + (lambda x, y: x.truediv(y, axis="index")), + (lambda x, y: x.rtruediv(y, axis="index")), + (lambda x, y: x.floordiv(y, axis="index")), + (lambda x, y: x.floordiv(y, axis="index")), + (lambda x, y: x.gt(y, axis="index")), + (lambda x, y: x.ge(y, axis="index")), + (lambda x, y: x.lt(y, axis="index")), + (lambda x, y: x.le(y, axis="index")), + ], + ids=[ + "add", + "radd", + "sub", + "rsub", + "mul", + "rmul", + "truediv", + "rtruediv", + "floordiv", + "rfloordiv", + "gt", + "ge", + "lt", + "le", + ], +) +def test_series_binop_axis_index( + scalars_dfs, + op, +): + scalars_df, scalars_pandas_df = scalars_dfs + df_columns = ["int64_col", "float64_col"] + series_column = "int64_too" + + bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas() + pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column]) + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("input"), + [ + ((1000, 2000, 3000)), + (pd.Index([1000, 2000, 3000])), + (pd.Series((1000, 2000), index=["int64_too", "float64_col"])), + ], + ids=[ + "tuple", + "pd_index", + "pd_series", + ], +) +def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas() + if hasattr(input, "to_pandas"): + input = input.to_pandas() + pd_result = scalars_pandas_df[df_columns].add(input, axis=1) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_reverse_binop_pandas(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + pd_series = pd.Series([100, 200, 300]) + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = pd_series + scalars_df[df_columns].to_pandas() + pd_result = pd_series + scalars_pandas_df[df_columns] + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +def test_listlike_binop_axis_1_bf_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = ( + scalars_df[df_columns] + .add(bf_indexes.Index([1000, 2000, 3000]), axis=1) + .to_pandas() + ) + pd_result = scalars_pandas_df[df_columns].add(pd.Index([1000, 2000, 3000]), axis=1) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +def test_binop_with_self_aggregate(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + df_columns = ["int64_col", "float64_col", "int64_too"] + + # Ensure that this takes the optimized single-query path by counting executions + execution_count_before = scalars_df._session._metrics.execution_count + bf_df = scalars_df[df_columns] + bf_result = (bf_df - bf_df.mean()).to_pandas() + execution_count_after = scalars_df._session._metrics.execution_count + + pd_df = scalars_pandas_df[df_columns] + pd_result = pd_df - pd_df.mean() + + executions = execution_count_after - execution_count_before + + assert executions == 1 + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +def test_binop_with_self_aggregate_w_index_reset(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + df_columns = ["int64_col", "float64_col", "int64_too"] + + # Ensure that this takes the optimized single-query path by counting executions + execution_count_before = scalars_df._session._metrics.execution_count + bf_df = scalars_df[df_columns].reset_index(drop=True) + bf_result = (bf_df - bf_df.mean()).to_pandas() + execution_count_after = scalars_df._session._metrics.execution_count + + pd_df = scalars_pandas_df[df_columns].reset_index(drop=True) + pd_result = pd_df - pd_df.mean() + + executions = execution_count_after - execution_count_before + + assert executions == 1 + pd_result.index = pd_result.index.astype("Int64") + assert_pandas_df_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("left_labels", "right_labels"), + [ + (["a", "a", "b"], ["c", "c", "d"]), + (["a", "b", "c"], ["c", "a", "b"]), + (["a", "c", "c"], ["c", "a", "c"]), + (["a", "b", "c"], ["a", "b", "c"]), + ], + ids=[ + "no_overlap", + "one_one_match", + "multi_match", + "exact_match", + ], +) +def test_binop_df_df_binary_op( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, + left_labels, + right_labels, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns] + bf_df_a.columns = left_labels + bf_df_b = scalars_df_2_index[columns] + bf_df_b.columns = right_labels + bf_result = (bf_df_a - bf_df_b).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns] + pd_df_a.columns = left_labels + pd_df_b = scalars_pandas_df_index[columns] + pd_df_b.columns = right_labels + pd_result = pd_df_a - pd_df_b + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +# Differnt table will only work for explicit index, since default index orders are arbitrary. +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_series_binop_add_different_table( + scalars_df_index, scalars_pandas_df_index, scalars_df_2_index, ordered +): + df_columns = ["int64_col", "float64_col"] + series_column = "int64_too" + + bf_result = ( + scalars_df_index[df_columns] + .add(scalars_df_2_index[series_column], axis="index") + .to_pandas(ordered=ordered) + ) + pd_result = scalars_pandas_df_index[df_columns].add( + scalars_pandas_df_index[series_column], axis="index" + ) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +# TODO(garrettwu): Test series binop with different index + +all_joins = pytest.mark.parametrize( + ("how",), + (("outer",), ("left",), ("right",), ("inner",), ("cross",)), +) + + +@all_joins +def test_join_same_table(scalars_dfs_maybe_ordered, how): + bf_df, pd_df = scalars_dfs_maybe_ordered + + bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] + bf_df_a = bf_df_a.sort_index() + + bf_df_b = bf_df.set_index("int64_too")[["float64_col"]] + bf_df_b = bf_df_b[bf_df_b.float64_col > 0] + bf_df_b = bf_df_b.sort_values("float64_col") + + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() + + pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]].sort_index() + pd_df_a = pd_df_a.sort_index() + + pd_df_b = pd_df.set_index("int64_too")[["float64_col"]] + pd_df_b = pd_df_b[pd_df_b.float64_col > 0] + pd_df_b = pd_df_b.sort_values("float64_col") + + pd_result = pd_df_a.join(pd_df_b, how=how) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +def test_join_incompatible_key_type_error(scalars_dfs): + bf_df, _ = scalars_dfs + + bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] + bf_df_a = bf_df_a.sort_index() + + bf_df_b = bf_df.set_index("date_col")[["float64_col"]] + bf_df_b = bf_df_b[bf_df_b.float64_col > 0] + bf_df_b = bf_df_b.sort_values("float64_col") + + with pytest.raises(TypeError): + # joining incompatible date, int columns + bf_df_a.join(bf_df_b, how="left") + + +@all_joins +def test_join_different_table( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index, how +): + bf_df_a = scalars_df_index[["string_col", "int64_col"]] + bf_df_b = scalars_df_2_index.dropna()[["float64_col"]] + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() + pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] + pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@all_joins +def test_join_different_table_with_duplicate_column_name( + scalars_df_index, scalars_pandas_df_index, how +): + bf_df_a = scalars_df_index[["string_col", "int64_col", "int64_too"]].rename( + columns={"int64_too": "int64_col"} + ) + bf_df_b = scalars_df_index.dropna()[ + ["string_col", "int64_col", "int64_too"] + ].rename(columns={"int64_too": "int64_col"}) + bf_result = bf_df_a.join(bf_df_b, how=how, lsuffix="_l", rsuffix="_r").to_pandas() + pd_df_a = scalars_pandas_df_index[["string_col", "int64_col", "int64_too"]].rename( + columns={"int64_too": "int64_col"} + ) + pd_df_b = scalars_pandas_df_index.dropna()[ + ["string_col", "int64_col", "int64_too"] + ].rename(columns={"int64_too": "int64_col"}) + pd_result = pd_df_a.join(pd_df_b, how=how, lsuffix="_l", rsuffix="_r") + + # Ensure no inplace changes + pd.testing.assert_index_equal(bf_df_a.columns, pd_df_a.columns) + pd.testing.assert_index_equal(bf_df_b.index.to_pandas(), pd_df_b.index) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@all_joins +def test_join_param_on_with_duplicate_column_name_not_on_col( + scalars_df_index, scalars_pandas_df_index, how +): + # This test is for duplicate column names, but the 'on' column is not duplicated. + if how == "cross": + return + bf_df_a = scalars_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_df_b = scalars_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_result = bf_df_a.join( + bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ).to_pandas() + pd_df_a = scalars_pandas_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_df_b = scalars_pandas_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_result = pd_df_a.join( + pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ) + pd.testing.assert_frame_equal( + bf_result.sort_index(), + pd_result.sort_index(), + check_like=True, + check_index_type=False, + check_names=False, + ) + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + + +@pytest.mark.skipif( + pandas.__version__.startswith("1."), reason="bad left join in pandas 1.x" +) +@all_joins +def test_join_param_on_with_duplicate_column_name_on_col( + scalars_df_index, scalars_pandas_df_index, how +): + # This test is for duplicate column names, and the 'on' column is duplicated. + if how == "cross": + return + bf_df_a = scalars_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_df_b = scalars_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_result = bf_df_a.join( + bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ).to_pandas() + pd_df_a = scalars_pandas_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_df_b = scalars_pandas_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_result = pd_df_a.join( + pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ) + pd.testing.assert_frame_equal( + bf_result.sort_index(), + pd_result.sort_index(), + check_like=True, + check_index_type=False, + check_names=False, + ) + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + + +@all_joins +def test_join_param_on(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] + bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) + bf_df_b = bf_df[["float64_col"]] + + if how == "cross": + with pytest.raises(ValueError): + bf_df_a.join(bf_df_b, on="rowindex_2", how=how) + else: + bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_df_b = pd_df[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@all_joins +def test_df_join_series(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] + bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) + bf_series_b = bf_df["float64_col"] + + if how == "cross": + with pytest.raises(ValueError): + bf_df_a.join(bf_series_b, on="rowindex_2", how=how) + else: + bf_result = bf_df_a.join(bf_series_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_series_b = pd_df["float64_col"] + pd_result = pd_df_a.join(pd_series_b, on="rowindex_2", how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@pytest.mark.parametrize( + ("by", "ascending", "na_position"), + [ + ("int64_col", True, "first"), + (["bool_col", "int64_col"], True, "last"), + ("int64_col", False, "first"), + (["bool_col", "int64_col"], [False, True], "last"), + (["bool_col", "int64_col"], [True, False], "first"), + ], +) +def test_dataframe_sort_values( + scalars_df_index, scalars_pandas_df_index, by, ascending, na_position +): + # Test needs values to be unique + bf_result = scalars_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ).to_pandas() + pd_result = scalars_pandas_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ) + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("by", "ascending", "na_position"), + [ + ("int64_col", True, "first"), + (["bool_col", "int64_col"], True, "last"), + ], +) +def test_dataframe_sort_values_inplace( + scalars_df_index, scalars_pandas_df_index, by, ascending, na_position +): + # Test needs values to be unique + bf_sorted = scalars_df_index.copy() + bf_sorted.sort_values( + by, ascending=ascending, na_position=na_position, inplace=True + ) + bf_result = bf_sorted.to_pandas() + pd_result = scalars_pandas_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ) + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_dataframe_sort_values_invalid_input(scalars_df_index): + with pytest.raises(KeyError): + scalars_df_index.sort_values(by=scalars_df_index["int64_col"]) + + +def test_dataframe_sort_values_stable(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index.sort_values("int64_col", kind="stable") + .sort_values("bool_col", kind="stable") + .to_pandas() + ) + pd_result = scalars_pandas_df_index.sort_values( + "int64_col", kind="stable" + ).sort_values("bool_col", kind="stable") + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("operator", "columns"), + [ + pytest.param(lambda x: x.cumsum(), ["float64_col", "int64_too"]), + pytest.param(lambda x: x.cumprod(), ["float64_col", "int64_too"]), + pytest.param( + lambda x: x.cumprod(), + ["string_col"], + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], + ids=[ + "cumsum", + "cumprod", + "non-numeric", + ], +) +def test_dataframe_numeric_analytic_op( + scalars_df_index, scalars_pandas_df_index, operator, columns +): + # TODO: Add nullable ints (pandas 1.x has poor behavior on these) + bf_series = operator(scalars_df_index[columns]) + pd_series = operator(scalars_pandas_df_index[columns]) + bf_result = bf_series.to_pandas() + pd.testing.assert_frame_equal(pd_series, bf_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x: x.cummin()), + (lambda x: x.cummax()), + (lambda x: x.shift(2)), + (lambda x: x.shift(-2)), + ], + ids=[ + "cummin", + "cummax", + "shiftpostive", + "shiftnegative", + ], +) +def test_dataframe_general_analytic_op( + scalars_df_index, scalars_pandas_df_index, operator +): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] + bf_series = operator(scalars_df_index[col_names]) + pd_series = operator(scalars_pandas_df_index[col_names]) + bf_result = bf_series.to_pandas() + pd.testing.assert_frame_equal( + pd_series, + bf_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods): + col_names = ["int64_too", "float64_col", "int64_col"] + bf_result = scalars_df_index[col_names].diff(periods=periods).to_pandas() + pd_result = scalars_pandas_df_index[col_names].diff(periods=periods) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods): + col_names = ["int64_too", "float64_col", "int64_col"] + bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas() + pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_dataframe_agg_single_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df[numeric_cols].agg("sum").to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg("sum") + + assert bf_result.dtype == "Float64" + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("agg",), + ( + ("sum",), + ("size",), + ), +) +def test_dataframe_agg_int_single_string(scalars_dfs, agg): + numeric_cols = ["int64_col", "int64_too", "bool_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df[numeric_cols].agg(agg).to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg(agg) + + assert bf_result.dtype == "Int64" + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_dataframe_agg_multi_string(scalars_dfs_maybe_ordered): + numeric_cols = ["int64_col", "int64_too", "float64_col"] + aggregations = [ + "sum", + "mean", + "median", + "std", + "var", + "min", + "max", + "nunique", + "count", + ] + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_result = scalars_df[numeric_cols].agg(aggregations) + pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + # Drop median, as it's an approximation. + bf_median = bf_result.loc["median", :] + bf_result = bf_result.drop(labels=["median"]) + pd_result = pd_result.drop(labels=["median"]) + + assert_dfs_equivalent(pd_result, bf_result, check_index_type=False) + + # Double-check that median is at least plausible. + assert ( + (bf_result.loc["min", :] <= bf_median) & (bf_median <= bf_result.loc["max", :]) + ).all() + + +def test_dataframe_agg_int_multi_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "bool_col"] + aggregations = [ + "sum", + "nunique", + "count", + "size", + ] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) + + for dtype in bf_result.dtypes: + assert dtype == "Int64" + + # Pandas may produce narrower numeric types + # Pandas has object index type + pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_transpose(): + # Include some floats to ensure type coercion + values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] + # Test complex case of both axes being multi-indices with non-unique elements + + columns: pandas.Index = pd.Index( + ["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow") + ) + columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"]) + + index: pandas.Index = pd.Index( + ["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow") + ) + rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) + + pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) + bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) + + pd_result = pd_df.T + bf_result = bf_df.T.to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + +def test_df_transpose_error(): + with pytest.raises(TypeError, match="Cannot coerce.*to a common type."): + dataframe.DataFrame([[1, "hello"], [2, "world"]]).transpose() + + +def test_df_transpose_repeated_uses_cache(): + bf_df = dataframe.DataFrame([[1, 2.5], [2, 3.5]]) + pd_df = pandas.DataFrame([[1, 2.5], [2, 3.5]]) + # Transposing many times so that operation will fail from complexity if not using cache + for i in range(10): + # Cache still works even with simple scalar binop + bf_df = bf_df.transpose() + i + pd_df = pd_df.transpose() + i + + pd.testing.assert_frame_equal( + pd_df, bf_df.to_pandas(), check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_stack(scalars_dfs, ordered): + if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): + pytest.skip("pandas <2.1 uses different stack implementation") + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].stack().to_pandas(ordered=ordered) + pd_result = scalars_pandas_df[columns].stack(future_stack=True) + + # Pandas produces NaN, where bq dataframes produces pd.NA + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) + + +def test_df_melt_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].melt().to_pandas() + pd_result = scalars_pandas_df[columns].melt() + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + check_index_type=False, + check_dtype=False, + ) + + +def test_df_melt_parameterized(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + + bf_result = scalars_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ).to_pandas() + pd_result = scalars_pandas_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ) + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_unstack(scalars_dfs, ordered): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = [ + "rowindex_2", + "int64_col", + "int64_too", + ] + + # unstack on mono-index produces series + bf_result = scalars_df[columns].unstack().to_pandas(ordered=ordered) + pd_result = scalars_pandas_df[columns].unstack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) + + +@pytest.mark.parametrize( + ("values", "index", "columns"), + [ + ("int64_col", "int64_too", ["string_col"]), + (["int64_col"], "int64_too", ["string_col"]), + (["int64_col", "float64_col"], "int64_too", ["string_col"]), + ], +) +def test_df_pivot(scalars_dfs, values, index, columns): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.pivot( + values=values, index=index, columns=columns + ).to_pandas() + pd_result = scalars_pandas_df.pivot(values=values, index=index, columns=columns) + + # Pandas produces NaN, where bq dataframes produces pd.NA + bf_result = bf_result.fillna(float("nan")) + pd_result = pd_result.fillna(float("nan")) + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("values", "index", "columns"), + [ + (["goals", "assists"], ["team_name", "season"], ["position"]), + (["goals", "assists"], ["season"], ["team_name", "position"]), + ], +) +def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns): + bf_result = ( + hockey_df.reset_index() + .pivot(values=values, index=index, columns=columns) + .to_pandas() + ) + pd_result = hockey_pandas_df.reset_index().pivot( + values=values, index=index, columns=columns + ) + + # Pandas produces NaN, where bq dataframes produces pd.NA + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("values", "index", "columns", "aggfunc"), + [ + (("culmen_length_mm", "body_mass_g"), "species", "sex", "std"), + (["body_mass_g", "culmen_length_mm"], ("species", "island"), "sex", "sum"), + ("body_mass_g", "sex", ["island", "species"], "mean"), + ("culmen_depth_mm", "island", "species", "max"), + ], +) +def test_df_pivot_table( + penguins_df_default_index, + penguins_pandas_df_default_index, + values, + index, + columns, + aggfunc, +): + bf_result = penguins_df_default_index.pivot_table( + values=values, index=index, columns=columns, aggfunc=aggfunc + ).to_pandas() + pd_result = penguins_pandas_df_default_index.pivot_table( + values=values, index=index, columns=columns, aggfunc=aggfunc + ) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_column_type=False + ) + + +def test_ipython_key_completions_with_drop(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = "string_col" + bf_dataframe = scalars_df.drop(columns=col_names) + pd_dataframe = scalars_pandas_df.drop(columns=col_names) + expected = pd_dataframe.columns.tolist() + + results = bf_dataframe._ipython_key_completions_() + + assert col_names not in results + assert results == expected + # _ipython_key_completions_ is called with square brackets + # so only column names are relevant with tab completion + assert "to_gbq" not in results + assert "merge" not in results + assert "drop" not in results + + +def test_ipython_key_completions_with_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"string_col": "a_renamed_column"} + bf_dataframe = scalars_df.rename(columns=col_name_dict) + pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) + expected = pd_dataframe.columns.tolist() + + results = bf_dataframe._ipython_key_completions_() + + assert "string_col" not in results + assert "a_renamed_column" in results + assert results == expected + # _ipython_key_completions_ is called with square brackets + # so only column names are relevant with tab completion + assert "to_gbq" not in results + assert "merge" not in results + assert "drop" not in results + + +def test__dir__with_drop(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = "string_col" + bf_dataframe = scalars_df.drop(columns=col_names) + pd_dataframe = scalars_pandas_df.drop(columns=col_names) + expected = pd_dataframe.columns.tolist() + + results = dir(bf_dataframe) + + assert col_names not in results + assert frozenset(expected) <= frozenset(results) + # __dir__ is called with a '.' and displays all methods, columns names, etc. + assert "to_gbq" in results + assert "merge" in results + assert "drop" in results + + +def test__dir__with_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"string_col": "a_renamed_column"} + bf_dataframe = scalars_df.rename(columns=col_name_dict) + pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) + expected = pd_dataframe.columns.tolist() + + results = dir(bf_dataframe) + + assert "string_col" not in results + assert "a_renamed_column" in results + assert frozenset(expected) <= frozenset(results) + # __dir__ is called with a '.' and displays all methods, columns names, etc. + assert "to_gbq" in results + assert "merge" in results + assert "drop" in results + + +def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index[["int64_col", "int64_col", "int64_too"]].to_pandas() + pd_result = scalars_pandas_df_index[["int64_col", "int64_col", "int64_too"]] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (0, 0, None), + (None, None, None), + (1, None, None), + (None, 4, None), + (None, None, 2), + (None, 50000000000, 1), + (5, 4, None), + (3, None, 2), + (1, 7, 2), + (1, 7, 50000000000), + ], +) +def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): + bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() + pd_result = scalars_pandas_df_index.iloc[start:stop:step] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (0, 0, None), + ], +) +def test_iloc_slice_after_cache( + scalars_df_index, scalars_pandas_df_index, start, stop, step +): + scalars_df_index.cache() + bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() + pd_result = scalars_pandas_df_index.iloc[start:stop:step] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_iloc_slice_zero_step(scalars_df_index): + with pytest.raises(ValueError): + scalars_df_index.iloc[0:0:0] + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): + bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) + pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +@pytest.mark.parametrize( + "index", + [0, 5, -2, (2,)], +) +def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + "index", + [(2, 5), (5, 0), (0, 0)], +) +def test_iloc_tuple(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + "index", + [(slice(None), [1, 2, 3]), (slice(1, 7, 2), [2, 5, 3])], +) +def test_iloc_tuple_multi_columns(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index].to_pandas() + pd_result = scalars_pandas_df_index.iloc[index] + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_iloc_tuple_multi_columns_single_row(scalars_df_index, scalars_pandas_df_index): + index = (2, [2, 1, 3, -4]) + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("index", "error"), + [ + ((1, 1, 1), pd.errors.IndexingError), + (("asd", "asd", "asd"), pd.errors.IndexingError), + (("asd"), TypeError), + ], +) +def test_iloc_tuple_errors(scalars_df_index, scalars_pandas_df_index, index, error): + with pytest.raises(error): + scalars_df_index.iloc[index] + with pytest.raises(error): + scalars_pandas_df_index.iloc[index] + + +@pytest.mark.parametrize( + "index", + [(2, 5), (5, 0), (0, 0)], +) +def test_iat(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iat[index] + pd_result = scalars_pandas_df_index.iat[index] + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("index", "error"), + [ + (0, TypeError), + ("asd", ValueError), + ((1, 2, 3), TypeError), + (("asd", "asd"), ValueError), + ], +) +def test_iat_errors(scalars_df_index, scalars_pandas_df_index, index, error): + with pytest.raises(error): + scalars_pandas_df_index.iat[index] + with pytest.raises(error): + scalars_df_index.iat[index] + + +def test_iloc_single_integer_out_of_bound_error(scalars_df_index): + with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): + scalars_df_index.iloc[99] + + +def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[scalars_df_index.bool_col].to_pandas() + pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index.bool_col] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_list_select_rows_and_columns(scalars_df_index, scalars_pandas_df_index): + idx_list = [0, 3, 5] + bf_result = scalars_df_index.loc[idx_list, ["bool_col", "int64_col"]].to_pandas() + pd_result = scalars_pandas_df_index.loc[idx_list, ["bool_col", "int64_col"]] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, "int64_col"].to_pandas() + pd_result = scalars_pandas_df_index.loc[:, "int64_col"] + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, scalars_df_index.dtypes == "Int64"].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.dtypes == "Int64" + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_with_column_condition_bf_series( + scalars_df_index, scalars_pandas_df_index +): + # (b/347072677) GEOGRAPH type doesn't support DISTINCT op + columns = [ + item for item in scalars_pandas_df_index.columns if item != "geography_col" + ] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + size_half = len(scalars_pandas_df_index) / 2 + bf_result = scalars_df_index.loc[ + :, scalars_df_index.nunique() > size_half + ].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.nunique() > size_half + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" + bf_result = scalars_df_index.loc[index] + pd_result = scalars_pandas_df_index.loc[index] + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.loc[index] + pd_result = scalars_pandas_df_index.loc[index] + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_at_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" + bf_result = scalars_df_index.at[index, "int64_too"] + pd_result = scalars_pandas_df_index.at[index, "int64_too"] + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_at_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.at[index, "string_col"] + pd_result = scalars_pandas_df_index.at[index, "string_col"] + assert bf_result == pd_result + + +def test_loc_setitem_bool_series_scalar_new_col(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["int64_too"] == 0, "new_col"] = 99 + pd_df.loc[pd_df["int64_too"] == 0, "new_col"] = 99 + + # pandas uses float64 instead + pd_df["new_col"] = pd_df["new_col"].astype("Float64") + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), + pd_df, + ) + + +@pytest.mark.parametrize( + ("col", "value"), + [ + ("string_col", "hello"), + ("int64_col", 3), + ("float64_col", 3.5), + ], +) +def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs, col, value): + if pd.__version__.startswith("1."): + pytest.skip("this loc overload not supported in pandas 1.x.") + + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["int64_too"] == 1, col] = value + pd_df.loc[pd_df["int64_too"] == 1, col] = value + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), + pd_df, + ) + + +def test_loc_setitem_bool_series_scalar_error(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("this loc overload not supported in pandas 1.x.") + + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(Exception): + bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = 99 + with pytest.raises(Exception): + pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99 + + +@pytest.mark.parametrize( + ("col", "op"), + [ + # Int aggregates + pytest.param("int64_col", lambda x: x.sum(), id="int-sum"), + pytest.param("int64_col", lambda x: x.min(), id="int-min"), + pytest.param("int64_col", lambda x: x.max(), id="int-max"), + pytest.param("int64_col", lambda x: x.count(), id="int-count"), + pytest.param("int64_col", lambda x: x.nunique(), id="int-nunique"), + # Float aggregates + pytest.param("float64_col", lambda x: x.count(), id="float-count"), + pytest.param("float64_col", lambda x: x.nunique(), id="float-nunique"), + # Bool aggregates + pytest.param("bool_col", lambda x: x.sum(), id="bool-sum"), + pytest.param("bool_col", lambda x: x.count(), id="bool-count"), + pytest.param("bool_col", lambda x: x.nunique(), id="bool-nunique"), + # String aggregates + pytest.param("string_col", lambda x: x.count(), id="string-count"), + pytest.param("string_col", lambda x: x.nunique(), id="string-nunique"), + ], +) +def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, op): + bf_result = op(scalars_df_index[[col]]).to_pandas() + pd_result = op(scalars_pandas_df_index[[col]]) + + # Check dtype separately + assert bf_result.dtype == "Int64" + # Is otherwise "object" dtype + pd_result.index = pd_result.index.astype("string[pyarrow]") + # Pandas may produce narrower numeric types + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +@pytest.mark.parametrize( + ("col", "op"), + [ + pytest.param("bool_col", lambda x: x.min(), id="bool-min"), + pytest.param("bool_col", lambda x: x.max(), id="bool-max"), + ], +) +def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col, op): + bf_result = op(scalars_df_index[[col]]).to_pandas() + pd_result = op(scalars_pandas_df_index[[col]]) + + # Check dtype separately + assert bf_result.dtype == "boolean" + + # Pandas may produce narrower numeric types + # Pandas has object index type + pd_result.index = pd_result.index.astype("string[pyarrow]") + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +@pytest.mark.parametrize( + ("op", "bf_dtype"), + [ + (lambda x: x.sum(numeric_only=True), "Float64"), + (lambda x: x.mean(numeric_only=True), "Float64"), + (lambda x: x.min(numeric_only=True), "Float64"), + (lambda x: x.max(numeric_only=True), "Float64"), + (lambda x: x.std(numeric_only=True), "Float64"), + (lambda x: x.var(numeric_only=True), "Float64"), + (lambda x: x.count(numeric_only=False), "Int64"), + (lambda x: x.nunique(), "Int64"), + ], + ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], +) +def test_dataframe_aggregates(scalars_dfs_maybe_ordered, op, bf_dtype): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered + col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] + bf_series = op(scalars_df_index[col_names]) + bf_result = bf_series + pd_result = op(scalars_pandas_df_index[col_names]) + + # Check dtype separately + assert bf_result.dtype == bf_dtype + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + # Pandas has object index type + pd_result.index = pd_result.index.astype("string[pyarrow]") + assert_series_equivalent( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.sum(axis=1, numeric_only=True)), + (lambda x: x.mean(axis=1, numeric_only=True)), + (lambda x: x.min(axis=1, numeric_only=True)), + (lambda x: x.max(axis=1, numeric_only=True)), + (lambda x: x.std(axis=1, numeric_only=True)), + (lambda x: x.var(axis=1, numeric_only=True)), + ], + ids=["sum", "mean", "min", "max", "std", "var"], +) +def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index, op): + col_names = ["int64_too", "int64_col", "float64_col", "bool_col", "string_col"] + bf_result = op(scalars_df_index[col_names]).to_pandas() + pd_result = op(scalars_pandas_df_index[col_names]) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + # Pandas has object index type + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] + bf_result = scalars_df_index[col_names].median(numeric_only=True).to_pandas() + pd_result = scalars_pandas_df_index[col_names].agg(["min", "max"]) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + # Median is an approximation, but double-check that median is plausible. + for col in col_names: + assert (pd_result.loc["min", col] <= bf_result[col]) and ( + bf_result[col] <= pd_result.loc["max", col] + ) + + +def test_dataframe_aggregates_quantile_mono(scalars_df_index, scalars_pandas_df_index): + q = 0.45 + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df_index): + q = [0, 0.33, 0.67, 1.0] + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + pd_result.index = pd_result.index.astype("Float64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.all(bool_only=True)), + (lambda x: x.any(bool_only=True)), + (lambda x: x.all(axis=1, bool_only=True)), + (lambda x: x.any(axis=1, bool_only=True)), + ], + ids=["all_axis0", "any_axis0", "all_axis1", "any_axis1"], +) +def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op): + # Pandas will drop nullable 'boolean' dtype so we convert first to bool, then cast back later + scalars_df_index = scalars_df_index.assign( + bool_col=scalars_df_index.bool_col.fillna(False) + ) + scalars_pandas_df_index = scalars_pandas_df_index.assign( + bool_col=scalars_pandas_df_index.bool_col.fillna(False).astype("bool") + ) + bf_series = op(scalars_df_index) + pd_series = op(scalars_pandas_df_index).astype("boolean") + bf_result = bf_series.to_pandas() + + pd_series.index = pd_series.index.astype(bf_result.index.dtype) + pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + + +def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col"] + bf_series = scalars_df_index[col_names].prod() + pd_series = scalars_pandas_df_index[col_names].prod() + bf_result = bf_series.to_pandas() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_series = pd_series.astype("Float64") + # Pandas has object index type + pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + + +def test_df_skew_too_few_values(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].head(2).skew().to_pandas() + pd_result = scalars_pandas_df[columns].head(2).skew() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_skew(scalars_dfs, ordered): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].skew().to_pandas(ordered=ordered) + pd_result = scalars_pandas_df[columns].skew() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + assert_series_equal( + pd_result, bf_result, check_index_type=False, ignore_order=not ordered + ) + + +def test_df_kurt_too_few_values(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].head(2).kurt().to_pandas() + pd_result = scalars_pandas_df[columns].head(2).kurt() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_df_kurt(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].kurt().to_pandas() + pd_result = scalars_pandas_df[columns].kurt() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("frac", "n", "random_state"), + [ + (None, 4, None), + (0.5, None, None), + (None, 4, 10), + (0.5, None, 10), + (None, None, None), + ], + ids=[ + "n_wo_random_state", + "frac_wo_random_state", + "n_w_random_state", + "frac_w_random_state", + "n_default", + ], +) +def test_sample(scalars_dfs, frac, n, random_state): + scalars_df, _ = scalars_dfs + df = scalars_df.sample(frac=frac, n=n, random_state=random_state) + bf_result = df.to_pandas() + + n = 1 if n is None else n + expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n + assert bf_result.shape[0] == expected_sample_size + assert bf_result.shape[1] == scalars_df.shape[1] + + +def test_sample_determinism(penguins_df_default_index): + df = penguins_df_default_index.sample(n=100, random_state=12345).head(15) + bf_result = df.to_pandas() + bf_result2 = df.to_pandas() + + pandas.testing.assert_frame_equal(bf_result, bf_result2) + + +def test_sample_raises_value_error(scalars_dfs): + scalars_df, _ = scalars_dfs + with pytest.raises( + ValueError, match="Only one of 'n' or 'frac' parameter can be specified." + ): + scalars_df.sample(frac=0.5, n=4) + + +def test_sample_args_sort(scalars_dfs): + scalars_df, _ = scalars_dfs + index = [4, 3, 2, 5, 1, 0] + scalars_df = scalars_df.iloc[index] + + kwargs = {"frac": 1.0, "random_state": 333} + + df = scalars_df.sample(**kwargs).to_pandas() + assert df.index.values != index + assert df.index.values != sorted(index) + + df = scalars_df.sample(sort="random", **kwargs).to_pandas() + assert df.index.values != index + assert df.index.values != sorted(index) + + df = scalars_df.sample(sort=True, **kwargs).to_pandas() + assert df.index.values == sorted(index) + + df = scalars_df.sample(sort=False, **kwargs).to_pandas() + assert df.index.values == index + + +@pytest.mark.parametrize( + ("axis",), + [ + (None,), + (0,), + (1,), + ], +) +def test_df_add_prefix(scalars_df_index, scalars_pandas_df_index, axis): + if pd.__version__.startswith("1."): + pytest.skip("add_prefix axis parameter not supported in pandas 1.x.") + bf_result = scalars_df_index.add_prefix("prefix_", axis).to_pandas() + + pd_result = scalars_pandas_df_index.add_prefix("prefix_", axis) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("axis",), + [ + (0,), + (1,), + ], +) +def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis): + if pd.__version__.startswith("1."): + pytest.skip("add_prefix axis parameter not supported in pandas 1.x.") + bf_result = scalars_df_index.add_suffix("_suffix", axis).to_pandas() + + pd_result = scalars_pandas_df_index.add_suffix("_suffix", axis) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + check_index_type=False, + ) + + +def test_df_astype_error_error(session): + input = pd.DataFrame(["hello", "world", "3.11", "4000"]) + with pytest.raises(ValueError): + session.read_pandas(input).astype("Float64", errors="bad_value") + + +def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): + pytest.skip("pandas filter items behavior different pre-2.1") + bf_result = scalars_df_index.filter(items=["string_col", "int64_col"]).to_pandas() + + pd_result = scalars_pandas_df_index.filter(items=["string_col", "int64_col"]) + # Ignore column ordering as pandas order differently depending on version + pd.testing.assert_frame_equal( + bf_result.sort_index(axis=1), + pd_result.sort_index(axis=1), + ) + + +def test_df_columns_filter_like(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.filter(like="64_col").to_pandas() + + pd_result = scalars_pandas_df_index.filter(like="64_col") + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_columns_filter_regex(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.filter(regex="^[^_]+$").to_pandas() + + pd_result = scalars_pandas_df_index.filter(regex="^[^_]+$") + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): + pytest.skip("pandas filter items behavior different pre-2.1") + bf_result = scalars_df_index.filter(items=[5, 1, 3], axis=0).to_pandas() + + pd_result = scalars_pandas_df_index.filter(items=[5, 1, 3], axis=0) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + # Ignore ordering as pandas order differently depending on version + assert_pandas_df_equal( + bf_result, + pd_result, + ignore_order=True, + check_names=False, + ) + + +def test_df_rows_filter_like(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index.filter(like="ello", axis=0).to_pandas() + + pd_result = scalars_pandas_df_index.filter(like="ello", axis=0) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_rows_filter_regex(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index.filter(regex="^[GH].*", axis=0).to_pandas() + + pd_result = scalars_pandas_df_index.filter(regex="^[GH].*", axis=0) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_rows_list(scalars_dfs_maybe_ordered): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered + bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]) + + pd_result = scalars_pandas_df_index.reindex(index=[5, 1, 3, 99, 1]) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + assert_dfs_equivalent( + pd_result, + bf_result, + ) + + +def test_df_reindex_rows_index(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.reindex( + index=pd.Index([5, 1, 3, 99, 1], name="newname") + ).to_pandas() + + pd_result = scalars_pandas_df_index.reindex( + index=pd.Index([5, 1, 3, 99, 1], name="newname") + ) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_nonunique(scalars_df_index): + with pytest.raises(ValueError): + # int64_too is non-unique + scalars_df_index.set_index("int64_too").reindex( + index=[5, 1, 3, 99, 1], validate=True + ) + + +def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"] + ).to_pandas() + + pd_result = scalars_pandas_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"] + ) + + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_columns_with_same_order(scalars_df_index, scalars_pandas_df_index): + # First, make sure the two dataframes have the same columns in order. + columns = ["int64_col", "int64_too"] + bf = scalars_df_index[columns] + pd_df = scalars_pandas_df_index[columns] + + bf_result = bf.reindex(columns=columns).to_pandas() + pd_result = pd_df.reindex(columns=columns) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index): + unsupported = [ + "geography_col", + ] + scalars_df_index = scalars_df_index.drop(columns=unsupported) + scalars_pandas_df_index = scalars_pandas_df_index.drop(columns=unsupported) + + bf_result = scalars_df_index.equals(scalars_df_index) + pd_result = scalars_pandas_df_index.equals(scalars_pandas_df_index) + + assert pd_result == bf_result + + +def test_df_equals_series(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index[["int64_col"]].equals(scalars_df_index["int64_col"]) + pd_result = scalars_pandas_df_index[["int64_col"]].equals( + scalars_pandas_df_index["int64_col"] + ) + + assert pd_result == bf_result + + +def test_df_equals_different_dtype(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + bf_modified = scalars_df_index.copy() + bf_modified = bf_modified.astype("Float64") + + pd_modified = scalars_pandas_df_index.copy() + pd_modified = pd_modified.astype("Float64") + + bf_result = scalars_df_index.equals(bf_modified) + pd_result = scalars_pandas_df_index.equals(pd_modified) + + assert pd_result == bf_result + + +def test_df_equals_different_values(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + bf_modified = scalars_df_index.copy() + bf_modified["int64_col"] = bf_modified.int64_col + 1 + + pd_modified = scalars_pandas_df_index.copy() + pd_modified["int64_col"] = pd_modified.int64_col + 1 + + bf_result = scalars_df_index.equals(bf_modified) + pd_result = scalars_pandas_df_index.equals(pd_modified) + + assert pd_result == bf_result + + +def test_df_equals_extra_column(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + more_columns = ["int64_col", "int64_too", "float64_col"] + + bf_result = scalars_df_index[columns].equals(scalars_df_index[more_columns]) + pd_result = scalars_pandas_df_index[columns].equals( + scalars_pandas_df_index[more_columns] + ) + + assert pd_result == bf_result + + +def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index): + reindex_target_bf = scalars_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1] + ) + bf_result = scalars_df_index.reindex_like(reindex_target_bf).to_pandas() + + reindex_target_pd = scalars_pandas_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1] + ) + pd_result = scalars_pandas_df_index.reindex_like(reindex_target_pd) + + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_values(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.values + + pd_result = scalars_pandas_df_index.values + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_frame_equal( + pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False + ) + + +def test_df_to_numpy(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_numpy() + + pd_result = scalars_pandas_df_index.to_numpy() + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_frame_equal( + pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False + ) + + +def test_df___array__(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.__array__() + + pd_result = scalars_pandas_df_index.__array__() + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_frame_equal( + pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False + ) + + +@pytest.mark.parametrize( + ("key",), + [ + ("hello",), + (2,), + ("int64_col",), + (None,), + ], +) +def test_df_contains(scalars_df_index, scalars_pandas_df_index, key): + bf_result = key in scalars_df_index + pd_result = key in scalars_pandas_df_index + + assert bf_result == pd_result + + +def test_df_getattr_attribute_error_when_pandas_has(scalars_df_index): + # swapaxes is implemented in pandas but not in bigframes + with pytest.raises(AttributeError): + scalars_df_index.swapaxes() + + +def test_df_getattr_attribute_error(scalars_df_index): + with pytest.raises(AttributeError): + scalars_df_index.not_a_method() + + +def test_df_getattr_axes(): + df = dataframe.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + assert isinstance(df.index, bigframes.core.indexes.Index) + assert isinstance(df.columns, pandas.Index) + assert isinstance(df.my_column, series.Series) + + +def test_df_setattr_index(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + + pd_df.index = pandas.Index([4, 5]) + bf_df.index = [4, 5] + + assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_df_setattr_columns(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + + pd_df.columns = typing.cast(pandas.Index, pandas.Index([4, 5, 6])) + + bf_df.columns = pandas.Index([4, 5, 6]) + + assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_df_setattr_modify_column(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + pd_df.my_column = [4, 5] + bf_df.my_column = [4, 5] + + assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): + index_list = scalars_pandas_df_index.string_col.iloc[[0, 1, 1, 5]].values + + scalars_df_index = scalars_df_index.set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") + + bf_result = scalars_df_index.loc[index_list].to_pandas() + pd_result = scalars_pandas_df_index.loc[index_list] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): + index_list = [3, 2, 1, 3, 2, 1] + + bf_result = scalars_df_index.loc[index_list] + pd_result = scalars_pandas_df_index.loc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_multiindex(scalars_dfs_maybe_ordered): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + index_list = [("Hello, World!", -234892), ("Hello, World!", 123456789)] + + bf_result = scalars_df_multiindex.loc[index_list] + pd_result = scalars_pandas_df_multiindex.loc[index_list] + + assert_dfs_equivalent( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + "index_list", + [ + [0, 1, 2, 3, 4, 4], + [0, 0, 0, 5, 4, 7, -2, -5, 3], + [-1, -2, -3, -4, -5, -5], + ], +) +def test_iloc_list(scalars_df_index, scalars_pandas_df_index, index_list): + bf_result = scalars_df_index.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +@pytest.mark.parametrize( + "index_list", + [ + [0, 1, 2, 3, 4, 4], + [0, 0, 0, 5, 4, 7, -2, -5, 3], + [-1, -2, -3, -4, -5, -5], + ], +) +def test_iloc_list_partial_ordering( + scalars_df_partial_ordering, scalars_pandas_df_index, index_list +): + bf_result = scalars_df_partial_ordering.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_list_multiindex(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) + scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) + + index_list = [0, 0, 0, 5, 4, 7] + + bf_result = scalars_df.iloc[index_list] + pd_result = scalars_pandas_df.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): + + index_list: List[int] = [] + + bf_result = scalars_df_index.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + bf_result = bf_result.to_pandas() + assert bf_result.shape == pd_result.shape # types are known to be different + + +def test_rename_axis(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.rename_axis("newindexname") + pd_result = scalars_pandas_df_index.rename_axis("newindexname") + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_axis_nonstring(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.rename_axis((4,)) + pd_result = scalars_pandas_df_index.rename_axis((4,)) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_index = scalars_df_index.set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") + + bf_result = scalars_df_index.loc[bf_string_series] + pd_result = scalars_pandas_df_index.loc[pd_string_series] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + + bf_result = scalars_df_multiindex.loc[bf_string_series] + pd_result = scalars_pandas_df_multiindex.loc[pd_string_series] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): + pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index + bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index + + bf_result = scalars_df_index.loc[bf_index] + pd_result = scalars_pandas_df_index.loc[pd_index] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_index_integer_index_renamed_col( + scalars_df_index, scalars_pandas_df_index +): + scalars_df_index = scalars_df_index.rename(columns={"int64_col": "rename"}) + scalars_pandas_df_index = scalars_pandas_df_index.rename( + columns={"int64_col": "rename"} + ) + + pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index + bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index + + bf_result = scalars_df_index.loc[bf_index] + pd_result = scalars_pandas_df_index.loc[pd_index] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +@pytest.mark.parametrize( + ("subset"), + [ + None, + "bool_col", + ["bool_col", "int64_too"], + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, subset): + columns = ["bool_col", "int64_too", "int64_col"] + bf_df = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas() + pd_df = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep) + pd.testing.assert_frame_equal( + pd_df, + bf_df, + ) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_df_drop_duplicates_w_json(json_df, keep): + bf_df = json_df.drop_duplicates(keep=keep).to_pandas() + + # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible + # with Arrow string extension types. Temporary conversion to standard Pandas + # strings is required. + json_pandas_df = json_df.to_pandas() + json_pandas_df["json_col"] = json_pandas_df["json_col"].astype( + pd.StringDtype(storage="pyarrow") + ) + + pd_df = json_pandas_df.drop_duplicates(keep=keep) + pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE) + pd.testing.assert_frame_equal( + pd_df, + bf_df, + ) + + +@pytest.mark.parametrize( + ("subset"), + [ + None, + ["bool_col"], + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset): + columns = ["bool_col", "int64_too", "int64_col"] + bf_series = scalars_df_index[columns].duplicated(subset, keep=keep).to_pandas() + pd_series = scalars_pandas_df_index[columns].duplicated(subset, keep=keep) + pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) + + +def test_df_from_dict_columns_orient(): + data = {"a": [1, 2], "b": [3.3, 2.4]} + bf_result = dataframe.DataFrame.from_dict(data, orient="columns").to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="columns") + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_dict_index_orient(): + data = {"a": [1, 2], "b": [3.3, 2.4]} + bf_result = dataframe.DataFrame.from_dict( + data, orient="index", columns=["col1", "col2"] + ).to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="index", columns=["col1", "col2"]) + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_dict_tight_orient(): + data = { + "index": [("i1", "i2"), ("i3", "i4")], + "columns": ["col1", "col2"], + "data": [[1, 2.6], [3, 4.5]], + "index_names": ["in1", "in2"], + "column_names": ["column_axis"], + } + + bf_result = dataframe.DataFrame.from_dict(data, orient="tight").to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="tight") + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_records(): + records = ((1, "a"), (2.5, "b"), (3.3, "c"), (4.9, "d")) + + bf_result = dataframe.DataFrame.from_records( + records, columns=["c1", "c2"] + ).to_pandas() + pd_result = pd.DataFrame.from_records(records, columns=["c1", "c2"]) + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_to_dict(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + bf_result = scalars_df_index.drop(columns=unsupported).to_dict() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_dict() + + assert bf_result == pd_result + + +def test_df_to_excel(scalars_df_index, scalars_pandas_df_index): + unsupported = ["timestamp_col"] + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.drop(columns=unsupported).to_excel(bf_result_file) + scalars_pandas_df_index.drop(columns=unsupported).to_excel(pd_result_file) + bf_result = bf_result_file.read() + pd_result = bf_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_latex(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + bf_result = scalars_df_index.drop(columns=unsupported).to_latex() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_latex() + + assert bf_result == pd_result + + +def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_json() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.to_json(default_handler=str) + + assert bf_result == pd_result + + +def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + # duration not fully supported at pandas level + scalars_df_index = scalars_df_index.drop(columns="duration_col") + scalars_pandas_df_index = scalars_pandas_df_index.drop(columns="duration_col") + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_json(bf_result_file, orient="table") + # default_handler for arrow types that have no default conversion + scalars_pandas_df_index.to_json( + pd_result_file, orient="table", default_handler=str + ) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_csv_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_csv() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.to_csv() + + assert bf_result == pd_result + + +def test_df_to_csv_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_csv(bf_result_file) + scalars_pandas_df_index.to_csv(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_parquet_local_bytes(scalars_df_index, scalars_pandas_df_index): + # GEOGRAPHY not supported in parquet export. + unsupported = ["geography_col"] + + bf_result = scalars_df_index.drop(columns=unsupported).to_parquet() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_parquet() + + assert bf_result == pd_result + + +def test_df_to_parquet_local_file(scalars_df_index, scalars_pandas_df_index): + # GEOGRAPHY not supported in parquet export. + unsupported = ["geography_col"] + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.drop(columns=unsupported).to_parquet(bf_result_file) + scalars_pandas_df_index.drop(columns=unsupported).to_parquet(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_records(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] + bf_result = scalars_df_index.drop(columns=unsupported).to_records() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_records() + + for bfi, pdi in zip(bf_result, pd_result): + for bfj, pdj in zip(bfi, pdi): + assert pd.isna(bfj) and pd.isna(pdj) or bfj == pdj + + +def test_df_to_string(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + + bf_result = scalars_df_index.drop(columns=unsupported).to_string() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_string() + + assert bf_result == pd_result + + +def test_df_to_html(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + + bf_result = scalars_df_index.drop(columns=unsupported).to_html() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_html() + + assert bf_result == pd_result + + +def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index): + # Nulls have bug from tabulate https://github.com/astanin/python-tabulate/issues/231 + bf_result = scalars_df_index.dropna().to_markdown() + pd_result = scalars_pandas_df_index.dropna().to_markdown() + + assert bf_result == pd_result + + +def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_pickle(bf_result_file) + scalars_pandas_df_index.to_pickle(pd_result_file) + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): + unsupported = [ + "numeric_col", + "bytes_col", + "date_col", + "datetime_col", + "time_col", + "timestamp_col", + "geography_col", + "duration_col", + ] + + bf_result_file = tempfile.TemporaryFile() + pd_result_file = tempfile.TemporaryFile() + scalars_df_index.drop(columns=unsupported).to_orc(bf_result_file) + scalars_pandas_df_index.drop(columns=unsupported).reset_index().to_orc( + pd_result_file + ) + bf_result = bf_result_file.read() + pd_result = bf_result_file.read() + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("expr",), + [ + ("new_col = int64_col + int64_too",), + ("new_col = (rowindex > 3) | bool_col",), + ("int64_too = bool_col\nnew_col2 = rowindex",), + ], +) +def test_df_eval(scalars_dfs, expr): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.eval(expr).to_pandas() + pd_result = scalars_pandas_df.eval(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("expr",), + [ + ("int64_col > int64_too",), + ("bool_col",), + ("((int64_col - int64_too) % @local_var) == 0",), + ], +) +def test_df_query(scalars_dfs, expr): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + # local_var is referenced in expressions + local_var = 3 # NOQA + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.query(expr).to_pandas() + pd_result = scalars_pandas_df.query(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("subset", "normalize", "ascending", "dropna"), + [ + (None, False, False, False), + (None, True, True, True), + ("bool_col", True, False, True), + ], +) +def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = ( + scalars_df[["string_col", "bool_col"]] + .value_counts(subset, normalize=normalize, ascending=ascending, dropna=dropna) + .to_pandas() + ) + pd_result = scalars_pandas_df[["string_col", "bool_col"]].value_counts( + subset, normalize=normalize, ascending=ascending, dropna=dropna + ) + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("na_option", "method", "ascending", "numeric_only", "pct"), + [ + ("keep", "average", True, True, True), + ("top", "min", False, False, False), + ("bottom", "max", False, False, True), + ("top", "first", False, False, False), + ("bottom", "dense", False, False, True), + ], +) +def test_df_rank_with_nulls( + scalars_df_index, + scalars_pandas_df_index, + na_option, + method, + ascending, + numeric_only, + pct, +): + unsupported_columns = ["geography_col"] + bf_result = ( + scalars_df_index.drop(columns=unsupported_columns) + .rank( + na_option=na_option, + method=method, + ascending=ascending, + numeric_only=numeric_only, + pct=pct, + ) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index.drop(columns=unsupported_columns) + .rank( + na_option=na_option, + method=method, + ascending=ascending, + numeric_only=numeric_only, + pct=pct, + ) + .astype(pd.Float64Dtype()) + ) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_bool_interpretation_error(scalars_df_index): + with pytest.raises(ValueError): + True if scalars_df_index else False + + +def test_query_job_setters(scalars_df_default_index: dataframe.DataFrame): + # if allow_large_results=False, might not create query job + with bigframes.option_context("compute.allow_large_results", True): + job_ids = set() + repr(scalars_df_default_index) + assert scalars_df_default_index.query_job is not None + job_ids.add(scalars_df_default_index.query_job.job_id) + scalars_df_default_index.to_pandas(allow_large_results=True) + job_ids.add(scalars_df_default_index.query_job.job_id) + + assert len(job_ids) == 2 + + +def test_df_cached(scalars_df_index): + df = scalars_df_index.set_index(["int64_too", "int64_col"]).sort_values( + "string_col" + ) + df = df[df["rowindex_2"] % 2 == 0] + + df_cached_copy = df.cache() + pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) + + +def test_df_cached_many_index_cols(scalars_df_index): + index_cols = [ + "int64_too", + "time_col", + "int64_col", + "bool_col", + "date_col", + "timestamp_col", + "string_col", + ] + df = scalars_df_index.set_index(index_cols) + df = df[df["rowindex_2"] % 2 == 0] + + df_cached_copy = df.cache() + pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) + + +def test_assign_after_binop_row_joins(): + pd_df = pd.DataFrame( + { + "idx1": [1, 1, 1, 1, 2, 2, 2, 2], + "idx2": [10, 10, 20, 20, 10, 10, 20, 20], + "metric1": [10, 14, 2, 13, 6, 2, 9, 5], + "metric2": [25, -3, 8, 2, -1, 0, 0, -4], + }, + dtype=pd.Int64Dtype(), + ).set_index(["idx1", "idx2"]) + bf_df = dataframe.DataFrame(pd_df) + + # Expect implicit joiner to be used, preserving input cardinality rather than getting relational join + bf_df["metric_diff"] = bf_df.metric1 - bf_df.metric2 + pd_df["metric_diff"] = pd_df.metric1 - pd_df.metric2 + + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + +def test_df_cache_with_implicit_join(scalars_df_index): + """expectation is that cache will be used, but no explicit join will be performed""" + df = scalars_df_index[["int64_col", "int64_too"]].sort_index().reset_index() + 3 + df.cache() + bf_result = df + (df * 2) + sql = bf_result.sql + + # Very crude asserts, want sql to not use join and not use base table, only reference cached table + assert "JOIN" not in sql + assert "bigframes_testing" not in sql + + +def test_df_dot_inline(session): + df1 = pd.DataFrame([[1, 2, 3], [2, 5, 7]]) + df2 = pd.DataFrame([[2, 4, 8], [1, 5, 10], [3, 6, 9]]) + + bf1 = session.read_pandas(df1) + bf2 = session.read_pandas(df2) + bf_result = bf1.dot(bf2).to_pandas() + pd_result = df1.dot(df2) + + # Patch pandas dtypes for testing parity + # Pandas uses int64 instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_dot( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = matrix_2by3_df.dot(matrix_3by4_df).to_pandas() + pd_result = matrix_2by3_pandas_df.dot(matrix_3by4_pandas_df) + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_dot_operator( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_dot_series_inline(): + left = [[1, 2, 3], [2, 5, 7]] + right = [2, 1, 3] + + bf1 = dataframe.DataFrame(left) + bf2 = series.Series(right) + bf_result = bf1.dot(bf2).to_pandas() + + df1 = pd.DataFrame(left) + df2 = pd.Series(right) + pd_result = df1.dot(df2) + + # Patch pandas dtypes for testing parity + # Pandas result is int64 instead of Int64 (nullable) dtype. + pd_result = pd_result.astype(pd.Int64Dtype()) + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_df_dot_series( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = matrix_2by3_df.dot(matrix_3by4_df["x"]).to_pandas() + pd_result = matrix_2by3_pandas_df.dot(matrix_3by4_pandas_df["x"]) + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + pd_result = pd_result.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_df_dot_operator_series( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df["x"]).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df["x"] + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + pd_result = pd_result.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +# TODO(tswast): We may be able to re-enable this test after we break large +# queries up in https://github.com/googleapis/python-bigquery-dataframes/pull/427 +@pytest.mark.skipif( + sys.version_info >= (3, 12), + # See: https://github.com/python/cpython/issues/112282 + reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", +) +def test_recursion_limit(scalars_df_index): + scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]] + for i in range(400): + scalars_df_index = scalars_df_index + 4 + scalars_df_index.to_pandas() + + +@pytest.mark.skipif( + reason="b/366477265: Skip until query complexity error can be reliably triggered." +) +def test_query_complexity_error(scalars_df_index): + # This test requires automatic caching/query decomposition to be turned off + bf_df = scalars_df_index + for _ in range(8): + bf_df = bf_df.merge(bf_df, on="int64_col").head(30) + bf_df = bf_df[bf_df.columns[:20]] + + with pytest.raises( + bigframes.exceptions.QueryComplexityError, match=r"Try using DataFrame\.cache" + ): + bf_df.to_pandas() + + +def test_query_complexity_repeated_joins( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + pd_df = scalars_pandas_df_index + bf_df = scalars_df_index + for _ in range(8): + # recursively join, resuling in 2^8 - 1 = 255 joins + pd_df = pd_df.merge(pd_df, on="int64_col").head(30) + pd_df = pd_df[pd_df.columns[:20]] + bf_df = bf_df.merge(bf_df, on="int64_col").head(30) + bf_df = bf_df[bf_df.columns[:20]] + + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result, check_index_type=False) + + +def test_query_complexity_repeated_subtrees( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + # Recursively union the data, if fully inlined has 10^5 identical root tables. + pd_df = scalars_pandas_df_index + bf_df = scalars_df_index + for _ in range(5): + pd_df = pd.concat(10 * [pd_df]).head(5) + bf_df = bpd.concat(10 * [bf_df]).head(5) + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.skipif( + sys.version_info >= (3, 12), + # See: https://github.com/python/cpython/issues/112282 + reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", +) +def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_index): + bf_df = scalars_df_index[["int64_col", "int64_too"]] + pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]] + # Uses LAG analytic operator, each in a new SELECT + for _ in range(50): + bf_df = bf_df.diff() + pd_df = pd_df.diff() + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result) + + +def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created): + dataset_id = dataset_id_not_created + destination_table = f"{dataset_id}.scalars_df" + + result_table = scalars_df_index.to_gbq(destination_table) + assert ( + result_table == destination_table + if destination_table + else result_table is not None + ) + + loaded_scalars_df_index = session.read_gbq(result_table) + assert not loaded_scalars_df_index.empty + + +def test_read_gbq_to_pandas_no_exec(unordered_session: bigframes.Session): + metrics = unordered_session._metrics + execs_pre = metrics.execution_count + df = unordered_session.read_gbq("bigquery-public-data.ml_datasets.penguins") + df.to_pandas() + execs_post = metrics.execution_count + assert df.shape == (344, 7) + assert execs_pre == execs_post + + +def test_to_gbq_table_labels(scalars_df_index): + destination_table = "bigframes-dev.bigframes_tests_sys.table_labels" + result_table = scalars_df_index.to_gbq( + destination_table, labels={"test": "labels"}, if_exists="replace" + ) + client = scalars_df_index._session.bqclient + table = client.get_table(result_table) + assert table.labels + assert table.labels["test"] == "labels" + + +@pytest.mark.parametrize( + ("col_names", "ignore_index"), + [ + pytest.param(["A"], False, id="one_array_false"), + pytest.param(["A"], True, id="one_array_true"), + pytest.param(["B"], False, id="one_float_false"), + pytest.param(["B"], True, id="one_float_true"), + pytest.param(["A", "C"], False, id="two_arrays_false"), + pytest.param(["A", "C"], True, id="two_arrays_true"), + ], +) +def test_dataframe_explode(col_names, ignore_index, session): + data = { + "A": [[0, 1, 2], [], [3, 4]], + "B": 3, + "C": [["a", "b", "c"], np.nan, ["d", "e"]], + } + + metrics = session._metrics + df = bpd.DataFrame(data, session=session) + pd_df = df.to_pandas() + pd_result = pd_df.explode(col_names, ignore_index=ignore_index) + bf_result = df.explode(col_names, ignore_index=ignore_index) + + # Check that to_pandas() results in at most a single query execution + execs_pre = metrics.execution_count + bf_materialized = bf_result.to_pandas() + execs_post = metrics.execution_count + + pd.testing.assert_frame_equal( + bf_materialized, + pd_result, + check_index_type=False, + check_dtype=False, + ) + # we test this property on this method in particular as compilation + # is non-deterministic and won't use the query cache as implemented + assert execs_post - execs_pre <= 1 + + +@pytest.mark.parametrize( + ("ignore_index", "ordered"), + [ + pytest.param(True, True, id="include_index_ordered"), + pytest.param(True, False, id="include_index_unordered"), + pytest.param(False, True, id="ignore_index_ordered"), + ], +) +def test_dataframe_explode_reserve_order(ignore_index, ordered): + data = { + "a": [np.random.randint(0, 10, 10) for _ in range(10)], + "b": [np.random.randint(0, 10, 10) for _ in range(10)], + } + df = bpd.DataFrame(data) + pd_df = pd.DataFrame(data) + + res = df.explode(["a", "b"], ignore_index=ignore_index).to_pandas(ordered=ordered) + pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index).astype( + pd.Int64Dtype() + ) + pd.testing.assert_frame_equal( + res if ordered else res.sort_index(), + pd_res, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("col_names"), + [ + pytest.param([], id="empty", marks=pytest.mark.xfail(raises=ValueError)), + pytest.param( + ["A", "A"], id="duplicate", marks=pytest.mark.xfail(raises=ValueError) + ), + pytest.param("unknown", id="unknown", marks=pytest.mark.xfail(raises=KeyError)), + ], +) +def test_dataframe_explode_xfail(col_names): + df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]}) + df.explode(col_names) + + +@pytest.mark.parametrize( + ("on", "rule", "origin"), + [ + pytest.param("datetime_col", "100D", "start"), + pytest.param("datetime_col", "30W", "start"), + pytest.param("datetime_col", "5M", "epoch"), + pytest.param("datetime_col", "3Q", "start_day"), + pytest.param("datetime_col", "3YE", "start"), + pytest.param( + "int64_col", "100D", "start", marks=pytest.mark.xfail(raises=TypeError) + ), + pytest.param( + "datetime_col", "100D", "end", marks=pytest.mark.xfail(raises=ValueError) + ), + ], +) +def test__resample_with_column( + scalars_df_index, scalars_pandas_df_index, on, rule, origin +): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + bf_result = ( + scalars_df_index._resample(rule=rule, on=on, origin=origin)[ + ["int64_col", "int64_too"] + ] + .max() + .to_pandas() + ) + pd_result = scalars_pandas_df_index.resample(rule=rule, on=on, origin=origin)[ + ["int64_col", "int64_too"] + ].max() + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("append", "level", "col", "rule"), + [ + pytest.param(False, None, "timestamp_col", "100d"), + pytest.param(True, 1, "timestamp_col", "1200h"), + pytest.param(False, None, "datetime_col", "100d"), + ], +) +def test__resample_with_index( + scalars_df_index, scalars_pandas_df_index, append, level, col, rule +): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df_index = scalars_df_index.set_index(col, append=append) + scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append) + bf_result = ( + scalars_df_index[["int64_col", "int64_too"]] + ._resample(rule=rule, level=level) + .min() + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index[["int64_col", "int64_too"]] + .resample(rule=rule, level=level) + .min() + ) + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("rule", "origin", "data"), + [ + ( + "5h", + "epoch", + { + "timestamp_col": pd.date_range( + start="2021-01-01 13:00:00", periods=30, freq="1h" + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ( + "75min", + "start_day", + { + "timestamp_col": pd.date_range( + start="2021-01-01 13:00:00", periods=30, freq="10min" + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ( + "7s", + "epoch", + { + "timestamp_col": pd.date_range( + start="2021-01-01 13:00:00", periods=30, freq="1s" + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ], +) +def test__resample_start_time(rule, origin, data): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + col = "timestamp_col" + scalars_df_index = bpd.DataFrame(data).set_index(col) + scalars_pandas_df_index = pd.DataFrame(data).set_index(col) + scalars_pandas_df_index.index.name = None + + bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() + + pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + "dtype", + [ + pytest.param("string[pyarrow]", id="type-string"), + pytest.param(pd.StringDtype(storage="pyarrow"), id="type-literal"), + pytest.param( + {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()}, + id="multiple-types", + ), + ], +) +def test_df_astype(scalars_dfs, dtype): + bf_df, pd_df = scalars_dfs + target_cols = ["bool_col", "int64_col"] + bf_df = bf_df[target_cols] + pd_df = pd_df[target_cols] + + bf_result = bf_df.astype(dtype).to_pandas() + pd_result = pd_df.astype(dtype) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +def test_df_astype_python_types(scalars_dfs): + bf_df, pd_df = scalars_dfs + target_cols = ["bool_col", "int64_col"] + bf_df = bf_df[target_cols] + pd_df = pd_df[target_cols] + + bf_result = bf_df.astype({"bool_col": str, "int64_col": float}).to_pandas() + pd_result = pd_df.astype( + {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()} + ) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +def test_astype_invalid_type_fail(scalars_dfs): + bf_df, _ = scalars_dfs + + with pytest.raises(TypeError, match=r".*Share your use case with.*"): + bf_df.astype(123) + + +def test_agg_with_dict_lists_strings(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "int64_col": ["min", "count"], + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_agg_with_dict_lists_callables(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": [np.min, np.max], + "int64_col": [np.min, np.var], + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_agg_with_dict_list_and_str(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "int64_col": "sum", + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_agg_with_dict_strs(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": "min", + "int64_col": "sum", + "float64_col": "max", + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + pd_result.index = pd_result.index.astype("string[pyarrow]") + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): + bf_df, _ = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "nonexisting_col": ["count"], + } + + with pytest.raises(KeyError): + bf_df.agg(agg_funcs) From 6801ca4dfef8928e8a056df46dcade5e55859f4c Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 23:49:05 +0000 Subject: [PATCH 33/37] notebook update --- notebooks/dataframes/anywidget_mode.ipynb | 66 +++++++---------------- 1 file changed, 19 insertions(+), 47 deletions(-) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 154afea7e1..62caa4c7ee 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "d10bfca4", "metadata": {}, "outputs": [], @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "ca22f059", "metadata": {}, "outputs": [], @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "1bc5aaf3", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "f289d250", "metadata": {}, "outputs": [ @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "42bb02ab", "metadata": {}, "outputs": [ @@ -123,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "ce250157", "metadata": {}, "outputs": [ @@ -142,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6e46f6d1352043a4baee57fa089f2b0c", + "model_id": "1d718cdbafcb42898120637cdb3fa267", "version_major": 2, "version_minor": 0 }, @@ -160,7 +160,7 @@ "Computation deferred. Computation will process 171.4 MB" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -179,22 +179,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "6920d49b", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -217,7 +205,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "88d370b617b545809eb7bb8e5c66ea0e", + "model_id": "519297c3ad19403aa844cbeabcd5eb44", "version_major": 2, "version_minor": 0 }, @@ -251,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "12b68f15", "metadata": {}, "outputs": [ @@ -288,24 +276,10 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "a9d5d13a", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 171.4 MB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -330,7 +304,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "dec19e8788b74219b88bccfc65e3b9c0", + "model_id": "37ba207603aa40a38c9786a210e712fd", "version_major": 2, "version_minor": 0 }, @@ -361,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "added-cell-1", "metadata": {}, "outputs": [ @@ -369,7 +343,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 85.9 kB in 21 seconds of slot time.\n", + " Query processed 85.9 kB in 23 seconds of slot time.\n", " " ], "text/plain": [ @@ -383,11 +357,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:969: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dataframe.py:869: UserWarning: Converting JSON columns to strings for display. This is temporary and will be removed when the frontend supports JSON types.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dataframe.py:867: UserWarning: Converting JSON columns to strings for display. This is temporary and will be removed when the frontend supports JSON types.\n", " warnings.warn(\n" ] }, @@ -408,7 +382,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "774357b4083c47c8a5e1fd33bb6af188", + "model_id": "379998ea9a744e7b8afd9c1bcb36548d", "version_major": 2, "version_minor": 0 }, @@ -426,7 +400,7 @@ "Computation deferred. Computation will process 0 Bytes" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -447,7 +421,6 @@ ], "metadata": { "kernelspec": { - "display_name": "3.10.18", "display_name": "3.10.18", "language": "python", "name": "python3" @@ -463,7 +436,6 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" - "version": "3.10.18" } }, "nbformat": 4, From 6c3567b7d573dc36e136841c0a2fac6453a3fa76 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 28 Oct 2025 00:07:05 +0000 Subject: [PATCH 34/37] call API on local data for complier.py --- bigframes/core/compile/polars/compiler.py | 31 ++++------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 0a6605b222..754294ec2f 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -622,32 +622,11 @@ def compile_readlocal(self, node: nodes.ReadLocalNode): for scan_item in node.scan_list.items } - # Workaround for PyArrow bug https://github.com/apache/arrow/issues/45262 - # Convert JSON columns to strings before Polars processing - arrow_data = node.local_data_source.data - schema = arrow_data.schema - - # Check if any columns are JSON type - json_field_indices = [ - i - for i, field in enumerate(schema) - if pa.types.is_extension_type(field.type) - and field.type.extension_name == "google:sqlType:json" - ] - - if json_field_indices: - # Convert JSON columns to string columns - new_arrays = [] - new_fields = [] - for i, field in enumerate(schema): - if i in json_field_indices: - # Cast JSON to string - new_arrays.append(arrow_data.column(i).cast(pa.string())) - new_fields.append(pa.field(field.name, pa.string())) - else: - new_arrays.append(arrow_data.column(i)) - new_fields.append(field) - arrow_data = pa.table(new_arrays, schema=pa.schema(new_fields)) + if hasattr(node.local_data_source, "to_arrow"): + schema, batches = node.local_data_source.to_arrow(json_type="string") + arrow_data = pa.Table.from_batches(batches, schema) + else: + arrow_data = node.local_data_source.data lazy_frame = cast(pl.DataFrame, pl.from_arrow(arrow_data)).lazy() lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) From dba9051306312ced3b05ca253f189e73ad688021 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 28 Oct 2025 00:32:06 +0000 Subject: [PATCH 35/37] add more testcase --- bigframes/display/anywidget.py | 2 + notebooks/dataframes/anywidget_mode.ipynb | 18 ++- tests/system/small/test_anywidget.py | 131 ++++++++++++++++++++++ tests/unit/test_dataframe.py | 23 ++++ tests/unit/test_polars_compiler.py | 86 ++++++++++++++ 5 files changed, 255 insertions(+), 5 deletions(-) create mode 100644 tests/unit/test_polars_compiler.py diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index cf5d4e6310..8930c611e9 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -231,6 +231,8 @@ def _set_table_html(self) -> None: cached_data = self._cached_data else: break + + # Get the data for the current page page_data = cached_data.iloc[start:end] # Generate HTML table diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 62caa4c7ee..744971f69e 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -142,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1d718cdbafcb42898120637cdb3fa267", + "model_id": "93dd10072d564a02a0278817d14855a9", "version_major": 2, "version_minor": 0 }, @@ -205,7 +205,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "519297c3ad19403aa844cbeabcd5eb44", + "model_id": "6e2538d446e344ac8505e4706730243e", "version_major": 2, "version_minor": 0 }, @@ -304,7 +304,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "37ba207603aa40a38c9786a210e712fd", + "model_id": "d6faf367ea5d44ad9d275506d870557a", "version_major": 2, "version_minor": 0 }, @@ -333,6 +333,14 @@ "The `AI.GENERATE` function in BigQuery returns results in a JSON column. While BigQuery's JSON type is not natively supported by the underlying Arrow `to_pandas_batches()` method used in anywidget mode ([Apache Arrow issue #45262](https://github.com/apache/arrow/issues/45262)), BigQuery Dataframes automatically converts JSON columns to strings for display. This allows you to view the results of generative AI functions seamlessly." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdadcad6", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 10, @@ -343,7 +351,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 85.9 kB in 23 seconds of slot time.\n", + " Query processed 85.9 kB in 24 seconds of slot time.\n", " " ], "text/plain": [ @@ -382,7 +390,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "379998ea9a744e7b8afd9c1bcb36548d", + "model_id": "b6d6f3bacc2c43fc9a335e6039db12a5", "version_major": 2, "version_minor": 0 }, diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 890d591de5..0587e13916 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -527,6 +527,137 @@ def test_json_column_anywidget_mode(mock_display, json_df: bf.dataframe.DataFram assert result == "" +def mock_execute_result_with_params( + self, schema, total_rows_val, arrow_batches_val, *args, **kwargs +): + """ + Mocks an execution result with configurable total_rows and arrow_batches. + """ + from bigframes.session.executor import ExecuteResult + + return ExecuteResult( + iter(arrow_batches_val), + schema=schema, + query_job=None, + total_bytes=None, + total_rows=total_rows_val, + ) + + +def test_widget_row_count_should_be_immutable_after_creation( + paginated_bf_df: bf.dataframe.DataFrame, +): + """ + Given a widget created with a specific configuration when global display + options are changed later, the widget's original row_count should remain + unchanged. + """ + from bigframes.display.anywidget import TableWidget + + # Use a context manager to ensure the option is reset + with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): + widget = TableWidget(paginated_bf_df) + initial_row_count = widget.row_count + + # Change a global option that could influence row count + bf.options.display.max_rows = 10 + + # Verify the row count remains immutable. + assert widget.row_count == initial_row_count + + +class FaultyIterator: + def __iter__(self): + return self + + def __next__(self): + raise ValueError("Simulated read error") + + +def test_widget_should_fallback_to_zero_rows_with_invalid_total_rows( + paginated_bf_df: bf.dataframe.DataFrame, + monkeypatch: pytest.MonkeyPatch, +): + """ + Given an internal component fails to return valid execution data, + when the TableWidget is created, its error_message should be set and displayed. + """ + # Patch the executor's 'execute' method to simulate an error. + monkeypatch.setattr( + "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute", + lambda self, *args, **kwargs: mock_execute_result_with_params( + self, paginated_bf_df._block.expr.schema, None, [], *args, **kwargs + ), + ) + + # Create the TableWidget under the error condition. + with bf.option_context("display.repr_mode", "anywidget"): + from bigframes.display.anywidget import TableWidget + + # The widget should handle the faulty data from the mock without crashing. + widget = TableWidget(paginated_bf_df) + + # The widget should have an error message and display it in the HTML. + assert widget.row_count == 0 + assert widget._error_message is not None + assert "Could not determine total row count" in widget._error_message + assert widget._error_message in widget.table_html + + +def test_widget_row_count_reflects_actual_data_available( + paginated_bf_df: bf.dataframe.DataFrame, +): + """ + Test that widget row_count reflects the actual data available, + regardless of theoretical limits. + """ + from bigframes.display.anywidget import TableWidget + + # Set up display options that define a page size. + with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): + widget = TableWidget(paginated_bf_df) + + # The widget should report the total rows in the DataFrame, + # not limited by page_size (which only affects pagination) + assert widget.row_count == EXPECTED_ROW_COUNT + assert widget.page_size == 2 # Respects the display option + + # TODO(shuowei): Add tests for custom index and multiindex # This may not be necessary for the SQL Cell use case but should be # considered for completeness. + + +@pytest.fixture(scope="module") +def empty_json_df(session: bf.Session) -> bf.dataframe.DataFrame: + """Create an empty DataFrame with a JSON column for testing.""" + import bigframes.dtypes + + pandas_df = pd.DataFrame( + { + "a": pd.Series(dtype="int64"), + "b": pd.Series(dtype=bigframes.dtypes.JSON_DTYPE), + } + ) + return session.read_pandas(pandas_df) + + +def test_empty_widget_with_json_column(empty_json_df: bf.dataframe.DataFrame): + """Given an empty DataFrame with a JSON column, the widget should render table headers.""" + with bf.option_context("display.repr_mode", "anywidget"): + from bigframes.display.anywidget import TableWidget + + widget = TableWidget(empty_json_df) + html = widget.table_html + + assert widget.row_count == 0 + assert " bigframes.dataframe.DataFrame: + """Create a DataFrame with a JSON column for testing.""" + import bigframes.dtypes + + pandas_df = pd.DataFrame( + { + "a": [1], + "b": ['{"c": 2, "d": 3}'], + } + ) + pandas_df["b"] = pandas_df["b"].astype(bigframes.dtypes.JSON_DTYPE) + return polars_session.read_pandas(pandas_df) + + +def test_to_pandas_batches_with_json_column(json_df: bigframes.dataframe.DataFrame): + """Test that JSON columns are converted to strings in to_pandas_batches.""" + batches = list(json_df._to_pandas_batches(page_size=10)) + assert len(batches) > 0 + # Verify the JSON column is now string type + assert batches[0]["b"].dtype == pd.StringDtype(storage="pyarrow") diff --git a/tests/unit/test_polars_compiler.py b/tests/unit/test_polars_compiler.py new file mode 100644 index 0000000000..fd620825cc --- /dev/null +++ b/tests/unit/test_polars_compiler.py @@ -0,0 +1,86 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import polars as pl +import pytest + +import bigframes as bf +import bigframes.core.compile.polars.compiler as polars_compiler +import bigframes.core.nodes as nodes +import bigframes.operations.json_ops as json_ops + + +def test_polars_to_json_string(): + """Test ToJSONString operation in Polars compiler.""" + compiler = polars_compiler.PolarsExpressionCompiler() + op = json_ops.ToJSONString() + # Polars doesn't have a native JSON type, it uses strings. + # The operation is a cast to string. + input_expr = pl.lit('{"b": 2}', dtype=pl.String) + result = compiler.compile_op(op, input_expr) + + df = pl.DataFrame({"a": ['{"b": 2}']}).lazy() + result_df = df.with_columns(result.alias("b")).collect() + assert result_df["b"][0] == '{"b": 2}' + assert result_df["b"].dtype == pl.String + + +def test_polars_parse_json(): + """Test ParseJSON operation in Polars compiler.""" + compiler = polars_compiler.PolarsExpressionCompiler() + op = json_ops.ParseJSON() + input_expr = pl.lit('{"b": 2}', dtype=pl.String) + result = compiler.compile_op(op, input_expr) + + df = pl.DataFrame({"a": ['{"b": 2}']}).lazy() + result_df = df.with_columns(result.alias("b")).collect() + # The result of json_decode is a struct + assert isinstance(result_df["b"][0], dict) + assert result_df["b"][0] == {"b": 2} + + +@pytest.mark.skip(reason="Polars does not have json_extract on string expressions") +def test_polars_json_extract(): + """Test JSONExtract operation in Polars compiler.""" + compiler = polars_compiler.PolarsExpressionCompiler() + op = json_ops.JSONExtract(json_path="$.b") + input_expr = pl.lit('{"a": 1, "b": "hello"}', dtype=pl.String) + result = compiler.compile_op(op, input_expr) + + df = pl.DataFrame({"a": ['{"b": "world"}']}).lazy() + result_df = df.with_columns(result.alias("b")).collect() + # json_extract returns a JSON encoded string + assert result_df["b"][0] == '"world"' + + +def test_readlocal_with_json_column(polars_session): + """Test ReadLocalNode compilation with JSON columns.""" + pandas_df = pd.DataFrame({"data": ['{"key": "value"}']}) + pandas_df["data"] = pandas_df["data"].astype(bf.dtypes.JSON_DTYPE) + bf_df = polars_session.read_pandas(pandas_df) + + node = bf_df._block.expr.node + # Traverse the node tree to find the ReadLocalNode + while not isinstance(node, nodes.ReadLocalNode): + node = node.child + assert isinstance(node, nodes.ReadLocalNode) + + compiler = polars_compiler.PolarsCompiler() + lazy_frame = compiler.compile_node(node) + result_df = lazy_frame.collect() + + # The compiler should have converted the JSON column to string. + assert result_df.schema["column_0"] == pl.String + assert result_df["column_0"][0] == '{"key":"value"}' From 0420c64a9020ab4f97fc8c471176507e93b7173b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 28 Oct 2025 00:51:33 +0000 Subject: [PATCH 36/37] modfiy polars import --- tests/unit/test_polars_compiler.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_polars_compiler.py b/tests/unit/test_polars_compiler.py index fd620825cc..95be7d5d00 100644 --- a/tests/unit/test_polars_compiler.py +++ b/tests/unit/test_polars_compiler.py @@ -13,9 +13,19 @@ # limitations under the License. import pandas as pd -import polars as pl import pytest +try: + import polars as pl + + POLARS_INSTALLED = True +except ImportError: + POLARS_INSTALLED = False + +if not POLARS_INSTALLED: + pytest.skip("polars is not installed", allow_module_level=True) + + import bigframes as bf import bigframes.core.compile.polars.compiler as polars_compiler import bigframes.core.nodes as nodes @@ -48,10 +58,9 @@ def test_polars_parse_json(): result_df = df.with_columns(result.alias("b")).collect() # The result of json_decode is a struct assert isinstance(result_df["b"][0], dict) - assert result_df["b"][0] == {"b": 2} + assert result_df["b"][0]["b"] == 2 -@pytest.mark.skip(reason="Polars does not have json_extract on string expressions") def test_polars_json_extract(): """Test JSONExtract operation in Polars compiler.""" compiler = polars_compiler.PolarsExpressionCompiler() @@ -59,10 +68,10 @@ def test_polars_json_extract(): input_expr = pl.lit('{"a": 1, "b": "hello"}', dtype=pl.String) result = compiler.compile_op(op, input_expr) - df = pl.DataFrame({"a": ['{"b": "world"}']}).lazy() + df = pl.DataFrame({"a": ['{"a": 1, "b": "hello"}']}).lazy() result_df = df.with_columns(result.alias("b")).collect() - # json_extract returns a JSON encoded string - assert result_df["b"][0] == '"world"' + # json_path_match returns the raw string value + assert result_df["b"][0] == "hello" def test_readlocal_with_json_column(polars_session): From 907cf2c1728a95ddf3dd5b05e2b7917dbbd21ff1 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 29 Oct 2025 07:07:46 +0000 Subject: [PATCH 37/37] fix failed tests --- bigframes/bigquery/_operations/ai.py | 7 +++++++ bigframes/core/compile/polars/compiler.py | 6 +++--- bigframes/ml/llm.py | 11 ++++++++++- bigframes/series.py | 8 -------- tests/system/small/test_dataframe.py | 12 ++++++++++++ 5 files changed, 32 insertions(+), 12 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 8579f7f298..07f81d87f5 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -123,6 +123,13 @@ def generate( if output_schema is None: output_schema_str = None else: + # Validate output schema types + for col_name, col_type in output_schema.items(): + if col_type.upper() == "JSON": + raise ValueError( + "JSON type is not supported in output_schema. " + "Supported types are: STRING, INT64, FLOAT64, BOOL, ARRAY, and STRUCT." + ) output_schema_str = ", ".join( [f"{name} {sql_type}" for name, sql_type in output_schema.items()] ) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 754294ec2f..e939f80120 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -434,13 +434,13 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @compile_op.register(json_ops.ParseJSON) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - # Parse string as JSON - this should decode, not encode - return input.str.json_decode() + # In Polars, JSON is stored as string, so no decoding needed + return input @compile_op.register(json_ops.JSONExtract) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: assert isinstance(op, json_ops.JSONExtract) - return input.str.json_extract(json_path=op.json_path) + return input.str.json_path_match(op.json_path) @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 531a043c45..edede34e8f 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -731,8 +731,17 @@ def predict( "ground_with_google_search": ground_with_google_search, } if output_schema: + supported_dtypes = ( + "int64", + "float64", + "bool", + "string", + "array", + "struct", + ) output_schema = { - k: utils.standardize_type(v) for k, v in output_schema.items() + k: utils.standardize_type(v, supported_dtypes=supported_dtypes) + for k, v in output_schema.items() } options["output_schema"] = output_schema return self._predict_and_retry( diff --git a/bigframes/series.py b/bigframes/series.py index 5448045092..5177bd0f33 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -611,14 +611,6 @@ def astype( raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") dtype = bigframes.dtypes.bigframes_type(dtype) - # BigQuery doesn't support CAST(json_col AS STRING), but it does support - # TO_JSON_STRING(json_col). - if ( - self.dtype == bigframes.dtypes.JSON_DTYPE - and dtype == bigframes.dtypes.STRING_DTYPE - ): - return self._apply_unary_op(ops.json_ops.ToJSONString()) - return self._apply_unary_op( bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null")) ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 79f8efd00f..a0c0e41a1b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -6142,3 +6142,15 @@ def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): with pytest.raises(KeyError): bf_df.agg(agg_funcs) + + +def test_to_pandas_batches_with_json_columns(session): + """Test that JSON columns are properly handled in to_pandas_batches.""" + # Create a DataFrame with JSON column + df = session.read_gbq('SELECT JSON \'{"key": "value"}\' as json_col') + + # This should not raise an error + batches = df._to_pandas_batches(page_size=10) + next(batches) + + # TODO