Skip to content

Commit 8c34512

Browse files
committed
Correctly display DataFrames with JSON columns in anywidget
1 parent d97cafc commit 8c34512

File tree

5 files changed

+189
-17
lines changed

5 files changed

+189
-17
lines changed

bigframes/core/blocks.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import warnings
4444

4545
import bigframes_vendored.constants as constants
46+
import db_dtypes
4647
import google.cloud.bigquery as bigquery
4748
import numpy
4849
import pandas as pd
@@ -134,6 +135,21 @@ class MaterializationOptions:
134135
ordered: bool = True
135136

136137

138+
def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType:
139+
"""Recursively replace JSONArrowType with string type."""
140+
if isinstance(pa_type, db_dtypes.JSONArrowType):
141+
return pa.string()
142+
if isinstance(pa_type, pa.ListType):
143+
return pa.list_(_replace_json_arrow_with_string(pa_type.value_type))
144+
if isinstance(pa_type, pa.StructType):
145+
new_fields = [
146+
field.with_type(_replace_json_arrow_with_string(field.type))
147+
for field in pa_type
148+
]
149+
return pa.struct(new_fields)
150+
return pa_type
151+
152+
137153
class Block:
138154
"""A immutable 2D data structure."""
139155

@@ -715,12 +731,32 @@ def to_pandas_batches(
715731
# To reduce the number of edge cases to consider when working with the
716732
# results of this, always return at least one DataFrame. See:
717733
# b/428918844.
718-
empty_val = pd.DataFrame(
719-
{
720-
col: pd.Series([], dtype=self.expr.get_column_type(col))
721-
for col in itertools.chain(self.value_columns, self.index_columns)
722-
}
723-
)
734+
series_map = {}
735+
for col in itertools.chain(self.value_columns, self.index_columns):
736+
dtype = self.expr.get_column_type(col)
737+
if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype):
738+
# Due to a limitation in Apache Arrow (#45262), JSON columns are not
739+
# natively supported by the to_pandas_batches() method, which is
740+
# used by the anywidget backend.
741+
# Workaround for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
742+
# PyArrow doesn't support creating an empty array with db_dtypes.JSONArrowType,
743+
# especially when nested.
744+
# Create with string type and then cast.
745+
746+
# MyPy doesn't automatically narrow the type of 'dtype' here,
747+
# so we add an explicit check.
748+
if isinstance(dtype, pd.ArrowDtype):
749+
safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype)
750+
safe_dtype = pd.ArrowDtype(safe_pa_type)
751+
series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype)
752+
else:
753+
# This branch should ideally not be reached if
754+
# contains_db_dtypes_json_dtype is accurate,
755+
# but it's here for MyPy's sake.
756+
series_map[col] = pd.Series([], dtype=dtype)
757+
else:
758+
series_map[col] = pd.Series([], dtype=dtype)
759+
empty_val = pd.DataFrame(series_map)
724760
dfs = map(
725761
lambda a: a[0],
726762
itertools.zip_longest(

bigframes/dataframe.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -783,8 +783,6 @@ def __repr__(self) -> str:
783783

784784
opts = bigframes.options.display
785785
max_results = opts.max_rows
786-
# anywidget mode uses the same display logic as the "deferred" mode
787-
# for faster execution
788786
if opts.repr_mode in ("deferred", "anywidget"):
789787
return formatter.repr_query_job(self._compute_dry_run())
790788

bigframes/session/executor.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
5252
result_rows = 0
5353

5454
for batch in self._arrow_batches:
55+
# Convert JSON columns to strings before casting
56+
batch = self._convert_json_to_string(batch)
5557
batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow())
5658
result_rows += batch.num_rows
5759

@@ -67,6 +69,38 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
6769

6870
yield batch
6971

72+
def _convert_json_to_string(
73+
self, batch: pyarrow.RecordBatch
74+
) -> pyarrow.RecordBatch:
75+
"""Convert JSON arrow extension types to string to avoid PyArrow compatibility issues."""
76+
import logging
77+
78+
new_arrays = []
79+
new_fields = []
80+
81+
for i, field in enumerate(batch.schema):
82+
array = batch.column(i)
83+
84+
# Check if this column should be JSON based on our schema
85+
schema_item = next(
86+
(item for item in self.schema.items if item.column == field.name), None
87+
)
88+
89+
if schema_item and schema_item.dtype == bigframes.dtypes.JSON_DTYPE:
90+
logging.info(f"Converting JSON column: {field.name}")
91+
# Convert JSONArrowType to string
92+
if array.type == bigframes.dtypes.JSON_ARROW_TYPE:
93+
array = array.cast(pyarrow.string())
94+
new_fields.append(pyarrow.field(field.name, pyarrow.string()))
95+
else:
96+
new_fields.append(field)
97+
98+
new_arrays.append(array)
99+
100+
return pyarrow.RecordBatch.from_arrays(
101+
new_arrays, schema=pyarrow.schema(new_fields)
102+
)
103+
70104
def to_arrow_table(self) -> pyarrow.Table:
71105
# Need to provide schema if no result rows, as arrow can't infer
72106
# If there are rows, it is safest to infer schema from batches.

mypy.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,6 @@ ignore_missing_imports = True
4444

4545
[mypy-anywidget]
4646
ignore_missing_imports = True
47+
48+
[mypy-db_dtypes]
49+
ignore_missing_imports = True

notebooks/dataframes/anywidget_mode.ipynb

Lines changed: 110 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,16 @@
3535
"execution_count": 2,
3636
"id": "ca22f059",
3737
"metadata": {},
38-
"outputs": [],
38+
"outputs": [
39+
{
40+
"name": "stderr",
41+
"output_type": "stream",
42+
"text": [
43+
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/_python_version_support.py:266: FutureWarning: You are using a Python version (3.10.15) which Google will stop supporting in new releases of google.api_core once it reaches its end of life (2026-10-04). Please upgrade to the latest Python version, or at least Python 3.11, to continue receiving updates for google.api_core past that date.\n",
44+
" warnings.warn(message, FutureWarning)\n"
45+
]
46+
}
47+
],
3948
"source": [
4049
"import bigframes.pandas as bpd"
4150
]
@@ -142,9 +151,9 @@
142151
{
143152
"data": {
144153
"application/vnd.jupyter.widget-view+json": {
145-
"model_id": "aafd4f912b5f42e0896aa5f0c2c62620",
154+
"model_id": "473b016aa6b24c86aafc6372352e822d",
146155
"version_major": 2,
147-
"version_minor": 0
156+
"version_minor": 1
148157
},
149158
"text/plain": [
150159
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
@@ -205,16 +214,17 @@
205214
{
206215
"data": {
207216
"application/vnd.jupyter.widget-view+json": {
208-
"model_id": "5ec0ad9f11874d4f9d8edbc903ee7b5d",
217+
"model_id": "339279cc312e4e7fb67923e4e6ad7779",
209218
"version_major": 2,
210-
"version_minor": 0
219+
"version_minor": 1
211220
},
212221
"text/plain": [
213222
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
214223
]
215224
},
225+
"execution_count": 7,
216226
"metadata": {},
217-
"output_type": "display_data"
227+
"output_type": "execute_result"
218228
}
219229
],
220230
"source": [
@@ -304,16 +314,17 @@
304314
{
305315
"data": {
306316
"application/vnd.jupyter.widget-view+json": {
307-
"model_id": "651b5aac958c408183775152c2573a03",
317+
"model_id": "8ff1f64c44304da0944eadbd0fb3981d",
308318
"version_major": 2,
309-
"version_minor": 0
319+
"version_minor": 1
310320
},
311321
"text/plain": [
312322
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
313323
]
314324
},
325+
"execution_count": 9,
315326
"metadata": {},
316-
"output_type": "display_data"
327+
"output_type": "execute_result"
317328
}
318329
],
319330
"source": [
@@ -323,6 +334,96 @@
323334
"print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n",
324335
"small_widget"
325336
]
337+
},
338+
{
339+
"cell_type": "markdown",
340+
"id": "added-cell-2",
341+
"metadata": {},
342+
"source": [
343+
"### Displaying Generative AI results containing JSON\n",
344+
"The `AI.GENERATE` function in BigQuery returns results in a JSON column. While BigQuery's JSON type is not natively supported by the underlying Arrow `to_pandas_batches()` method used in anywidget mode ([Apache Arrow issue #45262](https://github.com/apache/arrow/issues/45262)), BigQuery Dataframes automatically converts JSON columns to strings for display. This allows you to view the results of generative AI functions seamlessly."
345+
]
346+
},
347+
{
348+
"cell_type": "code",
349+
"execution_count": 10,
350+
"id": "added-cell-1",
351+
"metadata": {},
352+
"outputs": [
353+
{
354+
"data": {
355+
"text/html": [
356+
"✅ Completed. \n",
357+
" Query processed 85.9 kB in 15 seconds of slot time.\n",
358+
" "
359+
],
360+
"text/plain": [
361+
"<IPython.core.display.HTML object>"
362+
]
363+
},
364+
"metadata": {},
365+
"output_type": "display_data"
366+
},
367+
{
368+
"name": "stderr",
369+
"output_type": "stream",
370+
"text": [
371+
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:969: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
372+
"instead of using `db_dtypes` in the future when available in pandas\n",
373+
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
374+
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
375+
]
376+
},
377+
{
378+
"data": {
379+
"text/html": [
380+
"✅ Completed. "
381+
],
382+
"text/plain": [
383+
"<IPython.core.display.HTML object>"
384+
]
385+
},
386+
"metadata": {},
387+
"output_type": "display_data"
388+
},
389+
{
390+
"data": {
391+
"application/vnd.jupyter.widget-view+json": {
392+
"model_id": "a6d61e48cca642b7a57e6431359b4cc4",
393+
"version_major": 2,
394+
"version_minor": 1
395+
},
396+
"text/plain": [
397+
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
398+
]
399+
},
400+
"metadata": {},
401+
"output_type": "display_data"
402+
},
403+
{
404+
"data": {
405+
"text/html": [],
406+
"text/plain": [
407+
"Computation deferred. Computation will process 0 Bytes"
408+
]
409+
},
410+
"execution_count": 10,
411+
"metadata": {},
412+
"output_type": "execute_result"
413+
}
414+
],
415+
"source": [
416+
"bpd._read_gbq_colab(\"\"\"\n",
417+
" SELECT\n",
418+
" AI.GENERATE(\n",
419+
" prompt=>(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\n",
420+
" connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\n",
421+
" output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\n",
422+
" *\n",
423+
" FROM `bigquery-public-data.labeled_patents.extracted_data`\n",
424+
" LIMIT 5;\n",
425+
"\"\"\")"
426+
]
326427
}
327428
],
328429
"metadata": {

0 commit comments

Comments
 (0)