Skip to content

Commit 76790a8

Browse files
chelsea-lingcf-owl-bot[bot]tswast
authoredAug 8, 2024··
feat: create db_dtypes JSONDtype and JSONArray (#284)
* Copy JSONDtype and JSONArray from tests/extension/json and their tests * formatting * converts to ArrowStringArray * box and unbox between string(storage) and dict(getitem) * minor * fix test_getitem_scalar test * add docstring and remove unused functions * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix lint * address some comments * supports all types except Array * support array type * only import when pandas version is higher than 1.5.0 * exclude groupby and other tests * others * skip jsondtype and jsonarray * fixing * fix coverage file name * add a simple unit test * unit-test for some functionalities * address comments * fix test cover * fixing * Update db_dtypes/json.py * fixing * fixing * add pyarrow_dtypes * fixing --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Tim Sweña (Swast) <swast@google.com>
1 parent c2bce0b commit 76790a8

File tree

6 files changed

+873
-9
lines changed

6 files changed

+873
-9
lines changed
 

‎.github/workflows/unittest.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ jobs:
7676
python -m pip install nox
7777
- name: Run compliance tests
7878
env:
79-
COVERAGE_FILE: .coverage-${{ matrix.python }}
79+
COVERAGE_FILE: .coverage-compliance-${{ matrix.python }}
8080
run: |
8181
nox -s compliance-${{ matrix.python }}
8282
- name: Upload coverage results

‎db_dtypes/__init__.py

+26-8
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,14 @@
4343
# nanosecond precision when boxing scalars.
4444
_NP_BOX_DTYPE = "datetime64[us]"
4545

46-
pandas_release = packaging.version.parse(pandas.__version__).release
46+
47+
# To use JSONArray and JSONDtype, you'll need Pandas 1.5.0 or later. With the removal
48+
# of Python 3.7 compatibility, the minimum Pandas version will be updated to 1.5.0.
49+
if packaging.version.Version(pandas.__version__) >= packaging.version.Version("1.5.0"):
50+
from db_dtypes.json import JSONArray, JSONDtype
51+
else:
52+
JSONArray = None
53+
JSONDtype = None
4754

4855

4956
@pandas.api.extensions.register_extension_dtype
@@ -337,10 +344,21 @@ def __sub__(self, other):
337344
return super().__sub__(other)
338345

339346

340-
__all__ = [
341-
"__version__",
342-
"DateArray",
343-
"DateDtype",
344-
"TimeArray",
345-
"TimeDtype",
346-
]
347+
if not JSONArray or not JSONDtype:
348+
__all__ = [
349+
"__version__",
350+
"DateArray",
351+
"DateDtype",
352+
"TimeArray",
353+
"TimeDtype",
354+
]
355+
else:
356+
__all__ = [
357+
"__version__",
358+
"DateArray",
359+
"DateDtype",
360+
"JSONDtype",
361+
"JSONArray",
362+
"TimeArray",
363+
"TimeDtype",
364+
]

‎db_dtypes/json.py

+209
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import json
18+
19+
import numpy as np
20+
import pandas as pd
21+
import pandas.arrays as arrays
22+
import pandas.core.dtypes.common as common
23+
import pandas.core.indexers as indexers
24+
import pyarrow as pa
25+
import pyarrow.compute
26+
27+
28+
@pd.api.extensions.register_extension_dtype
class JSONDtype(pd.api.extensions.ExtensionDtype):
    """Extension dtype for BigQuery JSON data."""

    name = "dbjson"

    @classmethod
    def construct_array_type(cls):
        """Return the array type (``JSONArray``) associated with this dtype."""
        return JSONArray

    @property
    def na_value(self) -> pd.NA:
        """Default NA value used for this type."""
        return pd.NA

    @property
    def type(self) -> type[str]:
        """Scalar type for the array elements.

        A JSON value may be a ``dict``, ``list``, ``str``, ``int``, ``float``,
        ``bool`` or ``None``.  Pandas cannot express that union well, so the
        storage type (``str``) is reported instead.
        """
        return str

    @property
    def pyarrow_dtype(self):
        """pyarrow data type used for storing data in the pyarrow array."""
        return pa.string()

    @property
    def _is_numeric(self) -> bool:
        # JSON columns do not participate in numeric operations.
        return False

    @property
    def _is_boolean(self) -> bool:
        # JSON columns are not boolean-valued.
        return False
66+
67+
68+
class JSONArray(arrays.ArrowExtensionArray):
    """Extension array that handles BigQuery JSON data, leveraging a
    string-based pyarrow array for storage.

    Values are serialized to JSON strings (``_serialize_json``) on the way in
    and deserialized back to Python objects (``_deserialize_json``) when
    individual elements are accessed.
    """

    _dtype = JSONDtype()

    def __init__(self, values, dtype=None, copy=False) -> None:
        """Wrap a pyarrow ``Array``/``ChunkedArray`` of serialized JSON strings.

        Raises:
            ValueError: if ``values`` is not a pyarrow Array or ChunkedArray.
        """
        self._dtype = JSONDtype()
        if isinstance(values, pa.Array):
            self._pa_array = pa.chunked_array([values])
        elif isinstance(values, pa.ChunkedArray):
            self._pa_array = values
        else:
            raise ValueError(f"Unsupported type '{type(values)}' for JSONArray")

    @classmethod
    def _box_pa(
        cls, value, pa_type: pa.DataType | None = None
    ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
        """Box value into a pyarrow Array, ChunkedArray or Scalar."""
        # Only the string storage type is supported.
        assert pa_type is None or pa_type == cls._dtype.pyarrow_dtype

        # A dict is list-like but represents a single JSON scalar here, hence
        # the extra ``is_dict_like`` exclusion.
        if isinstance(value, pa.Scalar) or not (
            common.is_list_like(value) and not common.is_dict_like(value)
        ):
            return cls._box_pa_scalar(value)
        return cls._box_pa_array(value)

    @classmethod
    def _box_pa_scalar(cls, value) -> pa.Scalar:
        """Box value into a pyarrow Scalar (serializing non-NA values)."""
        if pd.isna(value):
            pa_scalar = pa.scalar(None, type=cls._dtype.pyarrow_dtype)
        else:
            value = JSONArray._serialize_json(value)
            pa_scalar = pa.scalar(
                value, type=cls._dtype.pyarrow_dtype, from_pandas=True
            )

        return pa_scalar

    @classmethod
    def _box_pa_array(cls, value, copy: bool = False) -> pa.Array | pa.ChunkedArray:
        """Box value into a pyarrow Array or ChunkedArray (serializing each item)."""
        if isinstance(value, cls):
            pa_array = value._pa_array
        else:
            value = [JSONArray._serialize_json(x) for x in value]
            pa_array = pa.array(value, type=cls._dtype.pyarrow_dtype, from_pandas=True)
        return pa_array

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Construct a new ExtensionArray from a sequence of scalars."""
        pa_array = cls._box_pa(scalars)
        arr = cls(pa_array)
        return arr

    @staticmethod
    def _serialize_json(value):
        """Convert a JSON value into its string representation (NA passes through)."""
        if not common.is_list_like(value) and pd.isna(value):
            return value
        else:
            # `sort_keys=True` sorts dictionary keys before serialization, making
            # JSON comparisons deterministic.
            return json.dumps(value, sort_keys=True)

    @staticmethod
    def _deserialize_json(value):
        """Convert a JSON string back into its original value (NA passes through)."""
        if not pd.isna(value):
            return json.loads(value)
        else:
            return value

    @property
    def dtype(self) -> JSONDtype:
        """An instance of JSONDtype"""
        return self._dtype

    def _cmp_method(self, other, op):
        """Support only equality comparisons; JSON columns are not orderable."""
        if op.__name__ == "eq":
            result = pyarrow.compute.equal(self._pa_array, self._box_pa(other))
        elif op.__name__ == "ne":
            result = pyarrow.compute.not_equal(self._pa_array, self._box_pa(other))
        else:
            # Comparison is not a meaningful one. We don't want to support sorting by JSON columns.
            raise TypeError(f"{op.__name__} not supported for JSONArray")
        return arrays.ArrowExtensionArray(result)

    def __getitem__(self, item):
        """Select a subset of self, deserializing scalar results into JSON objects."""
        item = indexers.check_array_indexer(self, item)

        if isinstance(item, np.ndarray):
            if not len(item):
                return type(self)(pa.chunked_array([], type=self.dtype.pyarrow_dtype))
            elif item.dtype.kind in "iu":
                return self.take(item)
            else:
                # `check_array_indexer` should verify that the assertion hold true.
                assert item.dtype.kind == "b"
                return type(self)(self._pa_array.filter(item))
        elif isinstance(item, tuple):
            item = indexers.unpack_tuple_and_ellipses(item)

        if common.is_scalar(item) and not common.is_integer(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )

        value = self._pa_array[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            scalar = JSONArray._deserialize_json(value.as_py())
            if scalar is None:
                return self._dtype.na_value
            else:
                return scalar

    def __iter__(self):
        """Iterate over elements of the array, yielding deserialized JSON objects."""
        for value in self._pa_array:
            val = JSONArray._deserialize_json(value.as_py())
            if val is None:
                yield self._dtype.na_value
            else:
                yield val

    def _reduce(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        """Return a scalar result of performing the reduction operation.

        Raises:
            TypeError: for ``min``/``max``, which are not meaningful for JSON.
        """
        if name in ["min", "max"]:
            # Fix: corrected "reducntion" typo in the error message.
            raise TypeError("JSONArray does not support min/max reduction.")
        # Fix: the original dropped the parent result (missing ``return``),
        # so every supported reduction silently returned None.
        return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)

‎tests/compliance/json/conftest.py

+181
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Copyright 2022 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
import json
17+
import random
18+
19+
import numpy as np
20+
import pandas as pd
21+
import pytest
22+
23+
from db_dtypes import JSONArray, JSONDtype
24+
25+
26+
def make_data():
    """Build a length-100 object array of assorted JSON values.

    The `np.array` constructor needs a consistent shape after the first
    dimension, so the sample pool deliberately contains no list values;
    a single list entry is injected afterwards.
    """
    samples = [
        True,  # Boolean
        100,  # Int
        0.98,  # Float
        "str",  # String
        {"bool_value": True},  # Dict with a boolean
        {"float_num": 3.14159},  # Dict with a float
        {"date": "2024-07-16"},  # Dict with a date (as strings)
        {"null_field": None},  # Dict with a null
        {"list_data": [10, 20, 30]},  # Dict with a list
        {"person": {"name": "Alice", "age": 35}},  # Dict with nested objects
        {"address": {"street": "123 Main St", "city": "Anytown"}},
        {"order": {"items": ["book", "pen"], "total": 15.99}},
    ]
    data = np.random.default_rng(2).choice(samples, size=100)
    # Replace one entry with an array value. Indices 0-2 are left untouched so
    # that `setitem` tests do not fail: setting with a list is ambiguous there.
    replace_at = random.randint(3, 99)
    data[replace_at] = [0.1, 0.2]  # Array
    return data
50+
51+
52+
@pytest.fixture
def dtype():
    """The JSON extension dtype under test."""
    return JSONDtype()


@pytest.fixture
def data():
    """Length-100 JSONArray for semantics test."""
    # Fix: the docstring previously said "PeriodArray" — a copy-paste from the
    # pandas extension test suite; this fixture builds a JSONArray.
    data = make_data()

    return JSONArray._from_sequence(data)


@pytest.fixture
def data_for_twos(dtype):
    """
    Length-100 array in which all the elements are two.

    Call pytest.skip in your fixture if the dtype does not support divmod.
    """
    pytest.skip(f"{dtype} is not a numeric dtype")


@pytest.fixture
def data_missing():
    """Length 2 array with [NA, Valid]"""
    return JSONArray._from_sequence([None, {"a": 10}])


@pytest.fixture
def data_missing_for_sorting():
    """Length-3 array with a missing value in the middle.

    NOTE(review): the values are pre-serialized with json.dumps before being
    handed to _from_sequence, which serializes again — the stored scalars are
    JSON-encoded *strings*, not dicts. Confirm this double encoding is
    intentional (sorting tests are skipped for this dtype).
    """
    return JSONArray._from_sequence([json.dumps({"b": 1}), None, json.dumps({"a": 4})])
84+
85+
86+
@pytest.fixture
def na_cmp():
    """
    Binary operator for comparing NA values.

    Should return a function of two arguments that returns
    True if both arguments are (scalar) NA for your type.

    By default, uses ``operator.is_``
    """

    def cmp(left, right):
        # Fix: the original returned a lambda (an always-truthy object)
        # instead of the comparison result, so NA checks passed vacuously.
        return pd.isna(left) and pd.isna(right)

    return cmp
101+
102+
103+
@pytest.fixture
def data_repeated(data):
    """
    Generate many datasets.

    Parameters
    ----------
    data : fixture implementing `data`

    Returns
    -------
    Callable[[int], Generator]:
        A callable that takes a `count` argument and
        returns a generator yielding `count` datasets.
    """

    def gen(count):
        yield from (data for _ in range(count))

    return gen
124+
125+
126+
_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"]


@pytest.fixture(params=_all_numeric_accumulations)
def all_numeric_accumulations(request):
    """Fixture parametrized over numeric accumulation names."""
    return request.param


_all_boolean_reductions = ["all", "any"]


@pytest.fixture(params=_all_boolean_reductions)
def all_boolean_reductions(request):
    """Fixture parametrized over boolean reduction names."""
    return request.param


_all_numeric_reductions = [
    "count",
    "sum",
    "max",
    "min",
    "mean",
    "prod",
    "std",
    "var",
    "median",
    "kurt",
    "skew",
    "sem",
]


@pytest.fixture(params=_all_numeric_reductions)
def all_numeric_reductions(request):
    """Fixture parametrized over numeric reduction names."""
    return request.param
170+
171+
172+
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """Parametrized fixture returning 'data' or 'data_missing' arrays.

    Used to test dtype conversion with and without missing values.
    """
    return data if request.param == "data" else data_missing
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,361 @@
1+
# Copyright 2022 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import json
16+
import typing
17+
18+
import numpy as np
19+
import pandas as pd
20+
import pandas._testing as tm
21+
import pandas.tests.extension.base as base
22+
import pytest
23+
24+
25+
class TestJSONArrayAccumulate(base.BaseAccumulateTests):
    pass


class TestJSONArrayCasting(base.BaseCastingTests):
    def test_astype_str(self, data):
        # JSON scalars must be compared against their `json.dumps` form, not
        # `str(obj)`, so the expectation is built here instead of in super().
        result = pd.Series(data[:5]).astype(str)
        expected = pd.Series(
            [json.dumps(v, sort_keys=True) for v in data[:5]], dtype=str
        )
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "nullable_string_dtype",
        [
            "string[python]",
            "string[pyarrow]",
        ],
    )
    def test_astype_string(self, data, nullable_string_dtype):
        # Same rationale as test_astype_str: expected values are JSON-encoded.
        result = pd.Series(data[:5]).astype(nullable_string_dtype)
        expected = pd.Series(
            [json.dumps(v, sort_keys=True) for v in data[:5]],
            dtype=nullable_string_dtype,
        )
        tm.assert_series_equal(result, expected)
53+
54+
55+
class TestJSONArrayConstructors(base.BaseConstructorsTests):
    def test_from_dtype(self, data):
        # Construct from our dtype and from its string alias.
        dtype = data.dtype

        expected = pd.Series(data)
        tm.assert_series_equal(pd.Series(list(data), dtype=dtype), expected)
        tm.assert_series_equal(pd.Series(list(data), dtype=str(dtype)), expected)

        # Wrap the data as `{"col1": data}` rather than passing it bare so the
        # DataFrame constructor does not attempt to interpret the
        # dictionary as column headers.
        # gh-30280
        expected = pd.DataFrame({"col1": data}).astype(dtype)
        tm.assert_frame_equal(
            pd.DataFrame({"col1": list(data)}, dtype=dtype), expected
        )
        tm.assert_frame_equal(
            pd.DataFrame({"col1": list(data)}, dtype=str(dtype)), expected
        )

    def test_series_constructor_scalar_with_index(self, data, dtype):
        # Serialize the scalar with json.dumps first so the Series constructor
        # does not attempt to interpret the dictionary as column headers.
        scalar = json.dumps(data[0])

        result = pd.Series(scalar, index=[1, 2, 3], dtype=dtype)
        expected = pd.Series([scalar] * 3, index=[1, 2, 3], dtype=dtype)
        tm.assert_series_equal(result, expected)

        result = pd.Series(scalar, index=["foo"], dtype=dtype)
        expected = pd.Series([scalar], index=["foo"], dtype=dtype)
        tm.assert_series_equal(result, expected)


@pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.")
class TestJSONArrayGroupby(base.BaseGroupbyTests):
    pass


class TestJSONArrayDtype(base.BaseDtypeTests):
    pass
100+
101+
102+
class TestJSONArrayGetitem(base.BaseGetitemTests):
    @pytest.mark.xfail(reason="JSONDtype's type returns its storage type.")
    def test_getitem_scalar(self, data):
        """
        `_getitem_` can return any JSON-types objects while `data.dtype.type` returns
        a string to indicate its storage type.
        > assert isinstance(result, data.dtype.type)
        E AssertionError
        """
        super().test_getitem_scalar(data)


class TestJSONArrayIndex(base.BaseIndexTests):
    pass


class TestJSONArrayInterface(base.BaseInterfaceTests):
    def test_array_interface(self, data):
        result = np.array(data)
        # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method.
        assert result[0] == json.dumps(data[0])

        result = np.array(data, dtype=object)
        # Use `json.dumps(x)` instead of passing `x` directly to the super method.
        expected = np.array([json.dumps(x) for x in data], dtype=object)
        # Fix: removed commented-out dead code (the nested-data ndim branch
        # copied from pandas) that no longer applied here.
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.skip(reason="2D support not implemented for JSONArray")
    def test_view(self, data):
        super().test_view(data)


class TestJSONArrayParsing(base.BaseParsingTests):
    @pytest.mark.xfail(reason="data type 'json' not understood")
    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data, request):
        super().test_EA_types(engine, data, request)
142+
143+
144+
class TestJSONArrayMethods(base.BaseMethodsTests):
    @pytest.mark.xfail(reason="Unhashable")
    def test_value_counts_with_normalize(self, data):
        super().test_value_counts_with_normalize(data)

    @pytest.mark.skip("fill-value is interpreted as a dict of values")
    def test_fillna_copy_frame(self, data_missing):
        super().test_fillna_copy_frame(data_missing)

    @pytest.mark.xfail(reason="combine for JSONArray not supported")
    def test_combine_le(self, data_repeated):
        super().test_combine_le(data_repeated)

    @pytest.mark.skip(reason="'<' not supported between instances of 'dict' and 'dict'")
    def test_searchsorted(self, data_for_sorting, as_series):
        # Fix: the original passed `self` explicitly into the *bound* super
        # call (`super().test_searchsorted(self, ...)`), which would raise a
        # TypeError if this skip were ever removed.
        super().test_searchsorted(data_for_sorting, as_series)

    @pytest.mark.xfail(
        reason="`to_numpy` returns serialized JSON, "
        + "while `__getitem__` returns JSON objects."
    )
    def test_where_series(self, data, na_value, as_frame):
        # `Series.where` calls `to_numpy` to get results.
        super().test_where_series(data, na_value, as_frame)

    @pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.")
    def test_factorize(self, data_for_grouping):
        super().test_factorize(data_for_grouping)

    @pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.")
    def test_factorize_equivalence(self, data_for_grouping):
        super().test_factorize_equivalence(data_for_grouping)

    @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.")
    def test_argsort(self, data_for_sorting):
        super().test_argsort(data_for_sorting)

    @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.")
    def test_argmin_argmax(self, data_for_sorting):
        super().test_argmin_argmax(data_for_sorting)

    @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.")
    def test_sort_values(self, data_for_sorting):
        super().test_sort_values(data_for_sorting)

    @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.")
    def test_sort_values_frame(self, data_for_sorting):
        super().test_sort_values_frame(data_for_sorting)
192+
193+
194+
class TestJSONArrayMissing(base.BaseMissingTests):
    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_series(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""
        super().test_fillna_series()

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_frame(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""
        super().test_fillna_frame()


@pytest.mark.skip(reason="BigQuery JSON does not allow Arithmetic Ops.")
class TestJSONArrayArithmeticOps(base.BaseArithmeticOpsTests):
    pass


class TestJSONArrayComparisonOps(base.BaseComparisonOpsTests):
    def test_compare_array(self, data, comparison_op, request):
        # Only equality-style comparisons are implemented for JSON.
        if comparison_op.__name__ not in ("eq", "ne"):
            request.applymarker(
                pytest.mark.xfail(reason="Comparison methods not implemented")
            )
        super().test_compare_array(data, comparison_op)

    def test_compare_scalar(self, data, comparison_op, request):
        # Only equality-style comparisons are implemented for JSON.
        if comparison_op.__name__ not in ("eq", "ne"):
            request.applymarker(
                pytest.mark.xfail(reason="Comparison methods not implemented")
            )
        super().test_compare_scalar(data, comparison_op)

    def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
        dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj))
        if op_name in ("__add__", "__radd__"):
            cast_to = dtype
        else:
            cast_to = "boolean[pyarrow]"  # type: ignore[assignment]
        return pointwise_result.astype(cast_to)


class TestJSONArrayUnaryOps(base.BaseUnaryOpsTests):
    pass


class TestJSONArrayPrinting(base.BasePrintingTests):
    pass


class TestJSONArrayReduce(base.BaseReduceTests):
    pass


class TestJSONArrayReshaping(base.BaseReshapingTests):
    @pytest.mark.skip(reason="2D support not implemented for JSONArray")
    def test_transpose(self, data):
        super().test_transpose(data)

    @pytest.mark.xfail(
        reason="`to_numpy` returns serialized JSON, "
        + "while `__getitem__` returns JSON objects."
    )
    def test_transpose_frame(self, data):
        # `DataFrame.T` calls `to_numpy` to get results.
        super().test_transpose_frame(data)
257+
258+
259+
class TestJSONArraySetitem(base.BaseSetitemTests):
    # Several base tests are overridden to assign `[value] * len(target)`
    # instead of a bare scalar, because pandas' internals have trouble setting
    # sequences of values (dicts/lists) into scalar positions.

    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
        ids=["list", "integer-array", "numpy-array"],
    )
    def test_setitem_integer_array(self, data, idx, box_in_series):
        subject = data[:5].copy()
        expected = data.take([0, 0, 0, 3, 4])

        if box_in_series:
            subject = pd.Series(subject)
            expected = pd.Series(expected)

        # Broadcast by hand: repeat the scalar instead of passing it bare.
        subject[idx] = [subject[0]] * len(subject[idx])
        tm.assert_equal(subject, expected)

    @pytest.mark.parametrize(
        "mask",
        [
            np.array([True, True, True, False, False]),
            pd.array([True, True, True, False, False], dtype="boolean"),
            pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
        ],
        ids=["numpy-array", "boolean-array", "boolean-array-na"],
    )
    def test_setitem_mask(self, data, mask, box_in_series):
        subject = data[:5].copy()
        expected = subject.take([0, 0, 0, 3, 4])
        if box_in_series:
            subject = pd.Series(subject)
            expected = pd.Series(expected)
        # Broadcast by hand: repeat the scalar instead of passing it bare.
        subject[mask] = [data[0]] * len(subject[mask])
        tm.assert_equal(expected, subject)

    def test_setitem_loc_iloc_slice(self, data):
        prefix = data[:5].copy()
        ser = pd.Series(prefix, index=["a", "b", "c", "d", "e"])
        expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=ser.index)

        result = ser.copy()
        # Broadcast by hand: repeat the scalar instead of passing it bare.
        result.iloc[:3] = [data[0]] * len(result.iloc[:3])
        tm.assert_equal(result, expected)

        result = ser.copy()
        result.loc[:"c"] = [data[0]] * len(result.loc[:"c"])
        tm.assert_equal(result, expected)

    def test_setitem_slice(self, data, box_in_series):
        subject = data[:5].copy()
        expected = data.take([0, 0, 0, 3, 4])
        if box_in_series:
            subject = pd.Series(subject)
            expected = pd.Series(expected)

        # Broadcast by hand: `[data[0]] * 3` instead of the bare scalar.
        subject[:3] = [data[0]] * 3
        tm.assert_equal(subject, expected)

    @pytest.mark.xfail(reason="only integer scalar arrays can be converted")
    def test_setitem_2d_values(self, data):
        super().test_setitem_2d_values(data)

    @pytest.mark.xfail(
        reason="`to_numpy` returns serialized JSON, "
        + "while `__getitem__` returns JSON objects."
    )
    def test_setitem_frame_2d_values(self, data):
        super().test_setitem_frame_2d_values(data)

    @pytest.mark.parametrize("setter", ["loc", None])
    def test_setitem_mask_broadcast(self, data, setter):
        ser = pd.Series(data)
        mask = np.zeros(len(data), dtype=bool)
        mask[:2] = True

        # `setter` selects between `ser.loc[...]` and plain `ser[...]`.
        target = getattr(ser, setter) if setter else ser

        # Broadcast by hand: repeat the scalar instead of passing it bare.
        target[mask] = [data[10]] * len(target[mask])
        assert ser[0] == data[10]
        assert ser[1] == data[10]

    @pytest.mark.xfail(reason="eq not implemented for <class 'dict'>")
    def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
        super().test_setitem_mask_boolean_array_with_na(data, box_in_series)

    @pytest.mark.skip(reason="2D support not implemented for JSONArray")
    def test_setitem_preserves_views(self, data):
        super().test_setitem_preserves_views(data)


class TestJSONArrayDim2Compat(base.Dim2CompatTests):
    pass

‎tests/unit/test_json.py

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
import json
17+
18+
import pandas as pd
19+
import pytest
20+
21+
import db_dtypes
22+
23+
# Check for minimum Pandas version.
24+
pytest.importorskip("pandas", minversion="1.5.0")
25+
26+
27+
# Python data types mirroring all standard JSON types:
28+
# https://json-schema.org/understanding-json-schema/reference/type
29+
JSON_DATA = {
30+
"boolean": True,
31+
"int": 100,
32+
"float": 0.98,
33+
"string": "hello world",
34+
"array": [0.1, 0.2],
35+
"dict": {
36+
"null_field": None,
37+
"order": {
38+
"items": ["book", "pen", "computer"],
39+
"total": 15.99,
40+
"address": {"street": "123 Main St", "city": "Anytown"},
41+
},
42+
},
43+
"null": None,
44+
}
45+
46+
47+
def test_construct_w_unspported_types():
    # NOTE(review): "unspported" is a typo in the test name, kept as-is so
    # test identifiers do not change.
    with pytest.raises(ValueError):
        db_dtypes.JSONArray(100)


def test_getitems_return_json_objects():
    data = db_dtypes.JSONArray._from_sequence(JSON_DATA.values())
    for i, key in enumerate(JSON_DATA):
        if key == "null":
            assert pd.isna(data[i])
        else:
            assert data[i] == JSON_DATA[key]


def test_getitems_w_unboxed_dict():
    data = db_dtypes.JSONArray._from_sequence([JSON_DATA["dict"]])
    unboxed = data[0]
    assert len(unboxed) == 2

    assert unboxed["null_field"] is None
    assert unboxed["order"]["address"]["city"] == "Anytown"
    assert len(unboxed["order"]["items"]) == 3
    assert unboxed["order"]["items"][0] == "book"

    with pytest.raises(KeyError):
        unboxed["unknown"]


def test_getitems_when_iter_with_null():
    data = db_dtypes.JSONArray._from_sequence([JSON_DATA["null"]])
    result = pd.Series(data)[:1].item()
    assert pd.isna(result)


def test_to_numpy():
    series = pd.Series(db_dtypes.JSONArray._from_sequence(JSON_DATA.values()))
    as_numpy = series.to_numpy()
    for i, key in enumerate(JSON_DATA):
        if key == "null":
            assert pd.isna(as_numpy[i])
        else:
            # to_numpy yields the serialized (sorted-key) JSON strings.
            assert as_numpy[i] == json.dumps(JSON_DATA[key], sort_keys=True)


def test_deterministic_json_serialization():
    x = {"a": 0, "b": 1}
    y = {"b": 1, "a": 0}
    data = db_dtypes.JSONArray._from_sequence([x])
    assert y in data

0 commit comments

Comments
 (0)
Please sign in to comment.