Skip to content

Commit e5e14e4

Browse files
committed
fix: managed arrow table iterates None list and struct incorrectly
1 parent 4c98c95 commit e5e14e4

File tree

3 files changed

+53
-16
lines changed

3 files changed

+53
-16
lines changed

bigframes/core/local_data.py

Lines changed: 19 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -234,9 +234,16 @@ def _(
234234
value_generator = iter_array(
235235
array.flatten(), bigframes.dtypes.get_array_inner_type(dtype)
236236
)
237-
for (start, end) in _pairwise(array.offsets):
238-
arr_size = end.as_py() - start.as_py()
239-
yield list(itertools.islice(value_generator, arr_size))
237+
offset_generator = iter_array(array.offsets, bigframes.dtypes.INT_DTYPE)
238+
is_null_generator = iter_array(array.is_null(), bigframes.dtypes.BOOL_DTYPE)
239+
previous_offset = next(offset_generator)
240+
for is_null, offset in zip(is_null_generator, offset_generator):
241+
arr_size = offset - previous_offset
242+
previous_offset = offset
243+
if is_null:
244+
yield None
245+
else:
246+
yield list(itertools.islice(value_generator, arr_size))
240247

241248
@iter_array.register
242249
def _(
@@ -248,8 +255,14 @@ def _(
248255
sub_generators[field_name] = iter_array(array.field(field_name), dtype)
249256

250257
keys = list(sub_generators.keys())
251-
for row_values in zip(*sub_generators.values()):
252-
yield {key: value for key, value in zip(keys, row_values)}
258+
row_values_iter = zip(*sub_generators.values())
259+
is_null_iter = array.is_null()
260+
261+
for is_row_null, row_values in zip(is_null_iter, row_values_iter):
262+
if not is_row_null:
263+
yield {key: value for key, value in zip(keys, row_values)}
264+
else:
265+
yield None
253266

254267
for batch in table.to_batches():
255268
sub_generators: dict[str, Generator[Any, None, None]] = {}
@@ -335,7 +348,7 @@ def _adapt_arrow_array(array: pa.Array) -> tuple[pa.Array, bigframes.dtypes.Dtyp
335348
new_value = pa.ListArray.from_arrays(
336349
array.offsets, values, mask=array.is_null()
337350
)
338-
return new_value.fill_null([]), bigframes.dtypes.list_type(values_type)
351+
return new_value, bigframes.dtypes.list_type(values_type)
339352
if array.type == bigframes.dtypes.JSON_ARROW_TYPE:
340353
return _canonicalize_json(array), bigframes.dtypes.JSON_DTYPE
341354
target_type = logical_type_replacements(array.type)

tests/system/small/engines/test_read_local.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -88,8 +88,9 @@ def test_engines_read_local_w_zero_row_source(
8888
assert_equivalence_execution(local_node, REFERENCE_ENGINE, engine)
8989

9090

91-
# TODO: Fix sqlglot impl
92-
@pytest.mark.parametrize("engine", ["polars", "bq", "pyarrow"], indirect=True)
91+
@pytest.mark.parametrize(
92+
"engine", ["polars", "bq", "pyarrow", "bq-sqlglot"], indirect=True
93+
)
9394
def test_engines_read_local_w_nested_source(
9495
fake_session: bigframes.Session,
9596
nested_data_source: local_data.ManagedArrowTable,

tests/unit/test_local_data.py

Lines changed: 31 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -20,20 +20,21 @@
2020

2121
pd_data = pd.DataFrame(
2222
{
23-
"ints": [10, 20, 30, 40],
24-
"nested_ints": [[1, 2], [3, 4, 5], [], [20, 30]],
25-
"structs": [{"a": 100}, {}, {"b": 200}, {"b": 300}],
23+
"ints": [10, 20, 30, 40, 50],
24+
"nested_ints": [[1, 2], None, [3, 4, 5], [], [20, 30]],
25+
"structs": [{"a": 100}, None, {}, {"b": 200}, {"b": 300}],
2626
}
2727
)
2828

2929
pd_data_normalized = pd.DataFrame(
3030
{
31-
"ints": pd.Series([10, 20, 30, 40], dtype=dtypes.INT_DTYPE),
31+
"ints": pd.Series([10, 20, 30, 40, 50], dtype=dtypes.INT_DTYPE),
3232
"nested_ints": pd.Series(
33-
[[1, 2], [3, 4, 5], [], [20, 30]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))
33+
[[1, 2], None, [3, 4, 5], [], [20, 30]],
34+
dtype=pd.ArrowDtype(pa.list_(pa.int64())),
3435
),
3536
"structs": pd.Series(
36-
[{"a": 100}, {}, {"b": 200}, {"b": 300}],
37+
[{"a": 100}, None, {}, {"b": 200}, {"b": 300}],
3738
dtype=pd.ArrowDtype(pa.struct({"a": pa.int64(), "b": pa.int64()})),
3839
),
3940
}
@@ -122,11 +123,11 @@ def test_local_data_well_formed_round_trip_chunked():
122123

123124
def test_local_data_well_formed_round_trip_sliced():
124125
pa_table = pa.Table.from_pandas(pd_data, preserve_index=False)
125-
as_rechunked_pyarrow = pa.Table.from_batches(pa_table.slice(2, 4).to_batches())
126+
as_rechunked_pyarrow = pa.Table.from_batches(pa_table.slice(0, 4).to_batches())
126127
local_entry = local_data.ManagedArrowTable.from_pyarrow(as_rechunked_pyarrow)
127128
result = pd.DataFrame(local_entry.itertuples(), columns=pd_data.columns)
128129
pandas.testing.assert_frame_equal(
129-
pd_data_normalized[2:4].reset_index(drop=True),
130+
pd_data_normalized[0:4].reset_index(drop=True),
130131
result.reset_index(drop=True),
131132
check_dtype=False,
132133
)
@@ -143,3 +144,25 @@ def test_local_data_not_equal_other():
143144
local_entry2 = local_data.ManagedArrowTable.from_pandas(pd_data[::2])
144145
assert local_entry != local_entry2
145146
assert hash(local_entry) != hash(local_entry2)
147+
148+
149+
def test_local_data_itertuples_struct_none():
150+
pd_data = pd.DataFrame(
151+
{
152+
"structs": [{"a": 100}, None, {"b": 200}, {"b": 300}],
153+
}
154+
)
155+
local_entry = local_data.ManagedArrowTable.from_pandas(pd_data)
156+
result = list(local_entry.itertuples())
157+
assert result[1][0] is None
158+
159+
160+
def test_local_data_itertuples_list_none():
161+
pd_data = pd.DataFrame(
162+
{
163+
"lists": [[1, 2], None, [3, 4]],
164+
}
165+
)
166+
local_entry = local_data.ManagedArrowTable.from_pandas(pd_data)
167+
result = list(local_entry.itertuples())
168+
assert result[1][0] is None

0 commit comments

Comments (0)