Skip to content

Commit 8c34512

Browse files
committed
Correctly display DataFrames with JSON columns in anywidget
1 parent d97cafc commit 8c34512

File tree

5 files changed

+189
-17
lines changed

5 files changed

+189
-17
lines changed

bigframes/core/blocks.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import warnings
4444

4545
import bigframes_vendored.constants as constants
46+
import db_dtypes
4647
import google.cloud.bigquery as bigquery
4748
import numpy
4849
import pandas as pd
@@ -134,6 +135,21 @@ class MaterializationOptions:
134135
ordered: bool = True
135136

136137

138+
def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType:
139+
"""Recursively replace JSONArrowType with string type."""
140+
if isinstance(pa_type, db_dtypes.JSONArrowType):
141+
return pa.string()
142+
if isinstance(pa_type, pa.ListType):
143+
return pa.list_(_replace_json_arrow_with_string(pa_type.value_type))
144+
if isinstance(pa_type, pa.StructType):
145+
new_fields = [
146+
field.with_type(_replace_json_arrow_with_string(field.type))
147+
for field in pa_type
148+
]
149+
return pa.struct(new_fields)
150+
return pa_type
151+
152+
137153
class Block:
138154
"""A immutable 2D data structure."""
139155

@@ -715,12 +731,32 @@ def to_pandas_batches(
715731
# To reduce the number of edge cases to consider when working with the
716732
# results of this, always return at least one DataFrame. See:
717733
# b/428918844.
718-
empty_val = pd.DataFrame(
719-
{
720-
col: pd.Series([], dtype=self.expr.get_column_type(col))
721-
for col in itertools.chain(self.value_columns, self.index_columns)
722-
}
723-
)
734+
series_map = {}
735+
for col in itertools.chain(self.value_columns, self.index_columns):
736+
dtype = self.expr.get_column_type(col)
737+
if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype):
738+
# Due to a limitation in Apache Arrow (#45262), JSON columns are not
739+
# natively supported by the to_pandas_batches() method, which is
740+
# used by the anywidget backend.
741+
# Workaround for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
742+
# PyArrow doesn't support creating an empty array with db_dtypes.JSONArrowType,
743+
# especially when nested.
744+
# Create with string type and then cast.
745+
746+
# MyPy doesn't automatically narrow the type of 'dtype' here,
747+
# so we add an explicit check.
748+
if isinstance(dtype, pd.ArrowDtype):
749+
safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype)
750+
safe_dtype = pd.ArrowDtype(safe_pa_type)
751+
series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype)
752+
else:
753+
# This branch should ideally not be reached if
754+
# contains_db_dtypes_json_dtype is accurate,
755+
# but it's here for MyPy's sake.
756+
series_map[col] = pd.Series([], dtype=dtype)
757+
else:
758+
series_map[col] = pd.Series([], dtype=dtype)
759+
empty_val = pd.DataFrame(series_map)
724760
dfs = map(
725761
lambda a: a[0],
726762
itertools.zip_longest(

bigframes/dataframe.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -783,8 +783,6 @@ def __repr__(self) -> str:
783783

784784
opts = bigframes.options.display
785785
max_results = opts.max_rows
786-
# anywidget mode uses the same display logic as the "deferred" mode
787-
# for faster execution
788786
if opts.repr_mode in ("deferred", "anywidget"):
789787
return formatter.repr_query_job(self._compute_dry_run())
790788

bigframes/session/executor.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
5252
result_rows = 0
5353

5454
for batch in self._arrow_batches:
55+
# Convert JSON columns to strings before casting
56+
batch = self._convert_json_to_string(batch)
5557
batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow())
5658
result_rows += batch.num_rows
5759

@@ -67,6 +69,38 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
6769

6870
yield batch
6971

72+
def _convert_json_to_string(
73+
self, batch: pyarrow.RecordBatch
74+
) -> pyarrow.RecordBatch:
75+
"""Convert JSON arrow extension types to string to avoid PyArrow compatibility issues."""
76+
import logging
77+
78+
new_arrays = []
79+
new_fields = []
80+
81+
for i, field in enumerate(batch.schema):
82+
array = batch.column(i)
83+
84+
# Check if this column should be JSON based on our schema
85+
schema_item = next(
86+
(item for item in self.schema.items if item.column == field.name), None
87+
)
88+
89+
if schema_item and schema_item.dtype == bigframes.dtypes.JSON_DTYPE:
90+
logging.info(f"Converting JSON column: {field.name}")
91+
# Convert JSONArrowType to string
92+
if array.type == bigframes.dtypes.JSON_ARROW_TYPE:
93+
array = array.cast(pyarrow.string())
94+
new_fields.append(pyarrow.field(field.name, pyarrow.string()))
95+
else:
96+
new_fields.append(field)
97+
98+
new_arrays.append(array)
99+
100+
return pyarrow.RecordBatch.from_arrays(
101+
new_arrays, schema=pyarrow.schema(new_fields)
102+
)
103+
70104
def to_arrow_table(self) -> pyarrow.Table:
71105
# Need to provide schema if no result rows, as arrow can't infer
72106
# If there are rows, it is safest to infer schema from batches.

mypy.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,6 @@ ignore_missing_imports = True
4444

4545
[mypy-anywidget]
4646
ignore_missing_imports = True
47+
48+
[mypy-db_dtypes]
49+
ignore_missing_imports = True

notebooks/dataframes/anywidget_mode.ipynb

Lines changed: 110 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,16 @@
3535
"execution_count": 2,
3636
"id": "ca22f059",
3737
"metadata": {},
38-
"outputs": [],
38+
"outputs": [
39+
{
40+
"name": "stderr",
41+
"output_type": "stream",
42+
"text": [
43+
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/_python_version_support.py:266: FutureWarning: You are using a Python version (3.10.15) which Google will stop supporting in new releases of google.api_core once it reaches its end of life (2026-10-04). Please upgrade to the latest Python version, or at least Python 3.11, to continue receiving updates for google.api_core past that date.\n",
44+
" warnings.warn(message, FutureWarning)\n"
45+
]
46+
}
47+
],
3948
"source": [
4049
"import bigframes.pandas as bpd"
4150
]
@@ -142,9 +151,9 @@
142151
{
143152
"data": {
144153
"application/vnd.jupyter.widget-view+json": {
145-
"model_id": "aafd4f912b5f42e0896aa5f0c2c62620",
154+
"model_id": "473b016aa6b24c86aafc6372352e822d",
146155
"version_major": 2,
147-
"version_minor": 0
156+
"version_minor": 1
148157
},
149158
"text/plain": [
150159
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
@@ -205,16 +214,17 @@
205214
{
206215
"data": {
207216
"application/vnd.jupyter.widget-view+json": {
208-
"model_id": "5ec0ad9f11874d4f9d8edbc903ee7b5d",
217+
"model_id": "339279cc312e4e7fb67923e4e6ad7779",
209218
"version_major": 2,
210-
"version_minor": 0
219+
"version_minor": 1
211220
},
212221
"text/plain": [
213222
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
214223
]
215224
},
225+
"execution_count": 7,
216226
"metadata": {},
217-
"output_type": "display_data"
227+
"output_type": "execute_result"
218228
}
219229
],
220230
"source": [
@@ -304,16 +314,17 @@
304314
{
305315
"data": {
306316
"application/vnd.jupyter.widget-view+json": {
307-
"model_id": "651b5aac958c408183775152c2573a03",
317+
"model_id": "8ff1f64c44304da0944eadbd0fb3981d",
308318
"version_major": 2,
309-
"version_minor": 0
319+
"version_minor": 1
310320
},
311321
"text/plain": [
312322
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
313323
]
314324
},
325+
"execution_count": 9,
315326
"metadata": {},
316-
"output_type": "display_data"
327+
"output_type": "execute_result"
317328
}
318329
],
319330
"source": [
@@ -323,6 +334,96 @@
323334
"print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n",
324335
"small_widget"
325336
]
337+
},
338+
{
339+
"cell_type": "markdown",
340+
"id": "added-cell-2",
341+
"metadata": {},
342+
"source": [
343+
"### Displaying Generative AI results containing JSON\n",
344+
"The `AI.GENERATE` function in BigQuery returns results in a JSON column. While BigQuery's JSON type is not natively supported by the underlying Arrow `to_pandas_batches()` method used in anywidget mode ([Apache Arrow issue #45262](https://github.com/apache/arrow/issues/45262)), BigQuery Dataframes automatically converts JSON columns to strings for display. This allows you to view the results of generative AI functions seamlessly."
345+
]
346+
},
347+
{
348+
"cell_type": "code",
349+
"execution_count": 10,
350+
"id": "added-cell-1",
351+
"metadata": {},
352+
"outputs": [
353+
{
354+
"data": {
355+
"text/html": [
356+
"✅ Completed. \n",
357+
" Query processed 85.9 kB in 15 seconds of slot time.\n",
358+
" "
359+
],
360+
"text/plain": [
361+
"<IPython.core.display.HTML object>"
362+
]
363+
},
364+
"metadata": {},
365+
"output_type": "display_data"
366+
},
367+
{
368+
"name": "stderr",
369+
"output_type": "stream",
370+
"text": [
371+
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:969: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
372+
"instead of using `db_dtypes` in the future when available in pandas\n",
373+
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
374+
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
375+
]
376+
},
377+
{
378+
"data": {
379+
"text/html": [
380+
"✅ Completed. "
381+
],
382+
"text/plain": [
383+
"<IPython.core.display.HTML object>"
384+
]
385+
},
386+
"metadata": {},
387+
"output_type": "display_data"
388+
},
389+
{
390+
"data": {
391+
"application/vnd.jupyter.widget-view+json": {
392+
"model_id": "a6d61e48cca642b7a57e6431359b4cc4",
393+
"version_major": 2,
394+
"version_minor": 1
395+
},
396+
"text/plain": [
397+
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
398+
]
399+
},
400+
"metadata": {},
401+
"output_type": "display_data"
402+
},
403+
{
404+
"data": {
405+
"text/html": [],
406+
"text/plain": [
407+
"Computation deferred. Computation will process 0 Bytes"
408+
]
409+
},
410+
"execution_count": 10,
411+
"metadata": {},
412+
"output_type": "execute_result"
413+
}
414+
],
415+
"source": [
416+
"bpd._read_gbq_colab(\"\"\"\n",
417+
" SELECT\n",
418+
" AI.GENERATE(\n",
419+
" prompt=>(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\n",
420+
" connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\n",
421+
" output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\n",
422+
" *\n",
423+
" FROM `bigquery-public-data.labeled_patents.extracted_data`\n",
424+
" LIMIT 5;\n",
425+
"\"\"\")"
426+
]
326427
}
327428
],
328429
"metadata": {

0 commit comments

Comments
 (0)