Skip to content

Commit b9bdca8

Browse files
authored
fix: dtype parameter ineffective in Series/DataFrame construction (#1354)
* fix: dtype parameter ineffective in Series IO
* Revert "docs: update struct examples. (#953)" — this reverts commit d632cd0.
* skip array tests because of dtype mismatches
1 parent 7ae565d commit b9bdca8

File tree

5 files changed

+116
-25
lines changed

5 files changed

+116
-25
lines changed

bigframes/dtypes.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -295,7 +295,10 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool:
295295
# See: https://stackoverflow.com/a/40312924/101923 and
296296
# https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html
297297
# for the way to identify object type.
298-
return type_ in ("object", "O") or getattr(type_, "kind", None) == "O"
298+
return type_ in ("object", "O") or (
299+
getattr(type_, "kind", None) == "O"
300+
and getattr(type_, "storage", None) != "pyarrow"
301+
)
299302

300303

301304
def is_string_like(type_: ExpressionType) -> bool:

tests/system/small/bigquery/test_json.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616

1717
import geopandas as gpd # type: ignore
1818
import pandas as pd
19+
import pyarrow as pa
1920
import pytest
2021

2122
import bigframes.bigquery as bbq
@@ -174,7 +175,7 @@ def test_json_extract_array_from_json_strings():
174175
actual = bbq.json_extract_array(s, "$.a")
175176
expected = bpd.Series(
176177
[['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None],
177-
dtype=pd.StringDtype(storage="pyarrow"),
178+
dtype=pd.ArrowDtype(pa.list_(pa.string())),
178179
)
179180
pd.testing.assert_series_equal(
180181
actual.to_pandas(),
@@ -190,7 +191,7 @@ def test_json_extract_array_from_json_array_strings():
190191
actual = bbq.json_extract_array(s)
191192
expected = bpd.Series(
192193
[["1", "2", "3"], [], ["4", "5"]],
193-
dtype=pd.StringDtype(storage="pyarrow"),
194+
dtype=pd.ArrowDtype(pa.list_(pa.string())),
194195
)
195196
pd.testing.assert_series_equal(
196197
actual.to_pandas(),

tests/system/small/test_dataframe.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -166,6 +166,19 @@ def test_df_construct_inline_respects_location():
166166
assert table.location == "europe-west1"
167167

168168

169+
def test_df_construct_dtype():
170+
data = {
171+
"int_col": [1, 2, 3],
172+
"string_col": ["1.1", "2.0", "3.5"],
173+
"float_col": [1.0, 2.0, 3.0],
174+
}
175+
dtype = pd.StringDtype(storage="pyarrow")
176+
bf_result = dataframe.DataFrame(data, dtype=dtype)
177+
pd_result = pd.DataFrame(data, dtype=dtype)
178+
pd_result.index = pd_result.index.astype("Int64")
179+
pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)
180+
181+
169182
def test_get_column(scalars_dfs):
170183
scalars_df, scalars_pandas_df = scalars_dfs
171184
col_name = "int64_col"

tests/system/small/test_series.py

Lines changed: 74 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
import pytest
2727
import shapely # type: ignore
2828

29+
import bigframes.features
2930
import bigframes.pandas
3031
import bigframes.series as series
3132
from tests.system.utils import (
@@ -228,6 +229,79 @@ def test_series_construct_geodata():
228229
)
229230

230231

232+
@pytest.mark.parametrize(
233+
("dtype"),
234+
[
235+
pytest.param(pd.Int64Dtype(), id="int"),
236+
pytest.param(pd.Float64Dtype(), id="float"),
237+
pytest.param(pd.StringDtype(storage="pyarrow"), id="string"),
238+
],
239+
)
240+
def test_series_construct_w_dtype_for_int(dtype):
241+
data = [1, 2, 3]
242+
expected = pd.Series(data, dtype=dtype)
243+
expected.index = expected.index.astype("Int64")
244+
series = bigframes.pandas.Series(data, dtype=dtype)
245+
pd.testing.assert_series_equal(series.to_pandas(), expected)
246+
247+
248+
def test_series_construct_w_dtype_for_struct():
249+
# The data shows the struct fields are disordered and correctly handled during
250+
# construction.
251+
data = [
252+
{"a": 1, "c": "pandas", "b": dt.datetime(2020, 1, 20, 20, 20, 20, 20)},
253+
{"a": 2, "c": "pandas", "b": dt.datetime(2019, 1, 20, 20, 20, 20, 20)},
254+
{"a": 1, "c": "numpy", "b": None},
255+
]
256+
dtype = pd.ArrowDtype(
257+
pa.struct([("a", pa.int64()), ("c", pa.string()), ("b", pa.timestamp("us"))])
258+
)
259+
series = bigframes.pandas.Series(data, dtype=dtype)
260+
expected = pd.Series(data, dtype=dtype)
261+
expected.index = expected.index.astype("Int64")
262+
pd.testing.assert_series_equal(series.to_pandas(), expected)
263+
264+
265+
def test_series_construct_w_dtype_for_array_string():
266+
data = [["1", "2", "3"], [], ["4", "5"]]
267+
dtype = pd.ArrowDtype(pa.list_(pa.string()))
268+
series = bigframes.pandas.Series(data, dtype=dtype)
269+
expected = pd.Series(data, dtype=dtype)
270+
expected.index = expected.index.astype("Int64")
271+
272+
# Skip dtype check due to internal issue b/321013333. This issue causes array types
273+
# to be converted to the `object` dtype when calling `to_pandas()`, resulting in
274+
# a mismatch with the expected Pandas type.
275+
if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable:
276+
check_dtype = True
277+
else:
278+
check_dtype = False
279+
280+
pd.testing.assert_series_equal(
281+
series.to_pandas(), expected, check_dtype=check_dtype
282+
)
283+
284+
285+
def test_series_construct_w_dtype_for_array_struct():
286+
data = [[{"a": 1, "c": "aa"}, {"a": 2, "c": "bb"}], [], [{"a": 3, "c": "cc"}]]
287+
dtype = pd.ArrowDtype(pa.list_(pa.struct([("a", pa.int64()), ("c", pa.string())])))
288+
series = bigframes.pandas.Series(data, dtype=dtype)
289+
expected = pd.Series(data, dtype=dtype)
290+
expected.index = expected.index.astype("Int64")
291+
292+
# Skip dtype check due to internal issue b/321013333. This issue causes array types
293+
# to be converted to the `object` dtype when calling `to_pandas()`, resulting in
294+
# a mismatch with the expected Pandas type.
295+
if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable:
296+
check_dtype = True
297+
else:
298+
check_dtype = False
299+
300+
pd.testing.assert_series_equal(
301+
series.to_pandas(), expected, check_dtype=check_dtype
302+
)
303+
304+
231305
def test_series_keys(scalars_dfs):
232306
scalars_df, scalars_pandas_df = scalars_dfs
233307
bf_result = scalars_df["int64_col"].keys().to_pandas()

third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py

Lines changed: 22 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -87,12 +87,12 @@ def field(self, name_or_index: str | int):
8787
>>> bpd.options.display.progress_bar = None
8888
>>> s = bpd.Series(
8989
... [
90-
... {"project": "pandas", "version": 1},
91-
... {"project": "pandas", "version": 2},
92-
... {"project": "numpy", "version": 1},
90+
... {"version": 1, "project": "pandas"},
91+
... {"version": 2, "project": "pandas"},
92+
... {"version": 1, "project": "numpy"},
9393
... ],
9494
... dtype=bpd.ArrowDtype(pa.struct(
95-
... [("project", pa.string()), ("version", pa.int64())]
95+
... [("version", pa.int64()), ("project", pa.string())]
9696
... ))
9797
... )
9898
@@ -106,7 +106,7 @@ def field(self, name_or_index: str | int):
106106
107107
Extract by field index.
108108
109-
>>> s.struct.field(1)
109+
>>> s.struct.field(0)
110110
0 1
111111
1 2
112112
2 1
@@ -133,22 +133,22 @@ def explode(self):
133133
>>> bpd.options.display.progress_bar = None
134134
>>> s = bpd.Series(
135135
... [
136-
... {"project": "pandas", "version": 1},
137-
... {"project": "pandas", "version": 2},
138-
... {"project": "numpy", "version": 1},
136+
... {"version": 1, "project": "pandas"},
137+
... {"version": 2, "project": "pandas"},
138+
... {"version": 1, "project": "numpy"},
139139
... ],
140140
... dtype=bpd.ArrowDtype(pa.struct(
141-
... [("project", pa.string()), ("version", pa.int64())]
141+
... [("version", pa.int64()), ("project", pa.string())]
142142
... ))
143143
... )
144144
145145
Extract all child fields.
146146
147147
>>> s.struct.explode()
148-
project version
149-
0 pandas 1
150-
1 pandas 2
151-
2 numpy 1
148+
version project
149+
0 1 pandas
150+
1 2 pandas
151+
2 1 numpy
152152
<BLANKLINE>
153153
[3 rows x 2 columns]
154154
@@ -178,8 +178,8 @@ def dtypes(self):
178178
... ))
179179
... )
180180
>>> s.struct.dtypes()
181-
project string[pyarrow]
182181
version Int64
182+
project string[pyarrow]
183183
dtype: object
184184
185185
Returns:
@@ -205,21 +205,21 @@ def explode(self, column, *, separator: str = "."):
205205
>>> countries = bpd.Series(["cn", "es", "us"])
206206
>>> files = bpd.Series(
207207
... [
208-
... {"project": "pandas", "version": 1},
209-
... {"project": "pandas", "version": 2},
210-
... {"project": "numpy", "version": 1},
208+
... {"version": 1, "project": "pandas"},
209+
... {"version": 2, "project": "pandas"},
210+
... {"version": 1, "project": "numpy"},
211211
... ],
212212
... dtype=bpd.ArrowDtype(pa.struct(
213-
... [("project", pa.string()), ("version", pa.int64())]
213+
... [("version", pa.int64()), ("project", pa.string())]
214214
... ))
215215
... )
216216
>>> downloads = bpd.Series([100, 200, 300])
217217
>>> df = bpd.DataFrame({"country": countries, "file": files, "download_count": downloads})
218218
>>> df.struct.explode("file")
219-
country file.project file.version download_count
220-
0 cn pandas 1 100
221-
1 es pandas 2 200
222-
2 us numpy 1 300
219+
country file.version file.project download_count
220+
0 cn 1 pandas 100
221+
1 es 2 pandas 200
222+
2 us 1 numpy 300
223223
<BLANKLINE>
224224
[3 rows x 4 columns]
225225

0 commit comments

Comments (0)