From 1a913156d236ae0666ba5c395f4ed166011d8c42 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Fri, 22 Dec 2023 10:34:46 -0800 Subject: [PATCH 1/6] Add test of Arrow roundtrip of boolean values --- pyogrio/tests/test_arrow.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index dc62bff1..ea8cf1e2 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -4,8 +4,9 @@ import pytest +import numpy as np from pyogrio import __gdal_version__, read_dataframe -from pyogrio.raw import open_arrow, read_arrow +from pyogrio.raw import open_arrow, read_arrow, write from pyogrio.tests.conftest import requires_arrow_api try: @@ -205,3 +206,22 @@ def test_enable_with_environment_variable(test_ogr_types_list): with use_arrow_context(): result = read_dataframe(test_ogr_types_list) assert "list_int64" in result.columns + + +def test_arrow_bool_roundtrip(tmpdir): + filename = os.path.join(str(tmpdir), "test.gpkg") + + # Point(0, 0) + geometry = np.array( + [bytes.fromhex("010100000000000000000000000000000000000000")] * 5, dtype=object + ) + bool_col = np.array([True, False, True, False, True]) + field_data = [bool_col] + fields = ["bool_col"] + + write( + filename, geometry, field_data, fields, geometry_type="Point", crs="EPSG:4326" + ) + table = read_arrow(filename)[1] + + assert np.array_equal(table["bool_col"].to_numpy(), bool_col) From 11fc649f68ee7f59db83b92284fa79b56fd347b6 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Mon, 22 Jan 2024 16:52:54 -0800 Subject: [PATCH 2/6] Upgrade CI tests to GDAL 3.8.3 --- .github/workflows/docker-gdal.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-gdal.yml b/.github/workflows/docker-gdal.yml index 37ac5d23..a626b181 100644 --- a/.github/workflows/docker-gdal.yml +++ b/.github/workflows/docker-gdal.yml @@ -21,7 +21,7 @@ jobs: matrix: container: - "ghcr.io/osgeo/gdal:ubuntu-small-latest" # >= python 3.10.6 - - "ghcr.io/osgeo/gdal:ubuntu-small-3.8.1" # python 3.10.12 + - "ghcr.io/osgeo/gdal:ubuntu-small-3.8.3" # python 3.10.12 - "ghcr.io/osgeo/gdal:ubuntu-small-3.7.3" # python 3.10.12 - "ghcr.io/osgeo/gdal:ubuntu-small-3.6.4" # python 3.10.6 - "osgeo/gdal:ubuntu-small-3.5.3" # python 3.8.10 From ef39b53e3c8a1100888f1febd3933e8e85487d9d Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 22 Jan 2024 17:45:19 -0800 Subject: [PATCH 3/6] Expand tests, raise exception when bool values detected for GDAL <3.8.3 --- CHANGES.md | 3 +++ pyogrio/raw.py | 14 ++++++++++- pyogrio/tests/test_arrow.py | 30 +++++++++++++++++++++++ pyogrio/tests/test_geopandas_io.py | 39 ++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 0e5556ee..6a237497 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,9 @@ - Fix error in `write_dataframe` if input has a date column and non-consecutive index values (#325). +- Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if + a boolean column is detected due to error in GDAL reading boolean values (#335) + this has been fixed in GDAL >= 3.8.3. ## 0.7.2 (2023-10-30) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 4384ee9a..6499867a 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -258,6 +258,8 @@ def read_arrow( """ from pyarrow import Table + gdal_version = get_gdal_version() + if skip_features < 0: raise ValueError("'skip_features' must be >= 0") @@ -275,7 +277,7 @@ def read_arrow( # handle skip_features internally within open_arrow if GDAL >= 3.8.0 gdal_skip_features = 0 - if get_gdal_version() >= (3, 8, 0): + if gdal_version >= (3, 8, 0): gdal_skip_features = skip_features skip_features = 0 @@ -328,6 +330,16 @@ def read_arrow( else: table = reader.read_all() + # raise error if schema has bool values and GDAL <3.8.3 + # due to https://github.com/OSGeo/gdal/issues/8998 + if gdal_version < (3, 8, 3) and any( + [col_type == "bool" for col_type in table.schema.types] + ): + raise RuntimeError( + "GDAL < 3.8.3 does not correctly read boolean data values using the " + "Arrow API. Do not use read_arrow() / use_arrow=True for this dataset." + ) + return meta, table diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index ea8cf1e2..62ad5a50 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -208,6 +208,10 @@ def test_enable_with_environment_variable(test_ogr_types_list): assert "list_int64" in result.columns +@pytest.mark.xfail( + __gdal_version__ < (3, 8, 3), + reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8998", +) def test_arrow_bool_roundtrip(tmpdir): filename = os.path.join(str(tmpdir), "test.gpkg") @@ -225,3 +229,29 @@ def test_arrow_bool_roundtrip(tmpdir): table = read_arrow(filename)[1] assert np.array_equal(table["bool_col"].to_numpy(), bool_col) + + +@pytest.mark.skipif( + __gdal_version__ >= (3, 8, 8), reason="Arrow bool value bug fixed in GDAL >= 3.8.3" +) +def test_arrow_bool_exception(tmpdir): + filename = os.path.join(str(tmpdir), "test.gpkg") + + # Point(0, 0) + geometry = np.array( + [bytes.fromhex("010100000000000000000000000000000000000000")] * 5, dtype=object + ) + bool_col = np.array([True, False, True, False, True]) + field_data = [bool_col] + fields = ["bool_col"] + + write( + filename, geometry, field_data, fields, geometry_type="Point", crs="EPSG:4326" + ) + + with pytest.raises( + RuntimeError, + match="GDAL < 3.8.3 does not correctly read boolean data values using " + "the Arrow API", + ): + read_arrow(filename) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 9b20c392..a9e1acd4 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1469,3 +1469,42 @@ def test_read_dataframe_arrow_dtypes(tmp_path): assert isinstance(result["col"].dtype, pd.ArrowDtype) result["col"] = result["col"].astype("float64") assert_geodataframe_equal(result, df) + + +@requires_arrow_api +@pytest.mark.xfail( + __gdal_version__ < (3, 8, 3), + reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8998", +) +def test_arrow_bool_roundtrip(tmpdir): + filename = os.path.join(str(tmpdir), "test.gpkg") + + df = gp.GeoDataFrame( + {{"bool_col": [True, False, True, False, True], "geometry": [Point(0, 0)] * 5}}, + crs="EPSG:4326", + ) + + write_dataframe(df, filename) + result = read_dataframe(filename, use_arrow=True) + assert_geodataframe_equal(result, df) + + +@pytest.mark.skipif( + __gdal_version__ >= (3, 8, 8), reason="Arrow bool value bug fixed in GDAL >= 3.8.3" +) +def test_arrow_bool_exception(tmpdir): + filename = os.path.join(str(tmpdir), "test.gpkg") + + df = gp.GeoDataFrame( + {"bool_col": [True, False, True, False, True], "geometry": [Point(0, 0)] * 5}, + crs="EPSG:4326", + ) + + write_dataframe(df, filename) + + with pytest.raises( + RuntimeError, + match="GDAL < 3.8.3 does not correctly read boolean data values using " + "the Arrow API", + ): + read_dataframe(filename, use_arrow=True) From 2e8074e1e2fb0a123b709aee21c3fe416f39efdc Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 22 Jan 2024 17:51:53 -0800 Subject: [PATCH 4/6] Fix typos --- pyogrio/tests/test_arrow.py | 2 +- pyogrio/tests/test_geopandas_io.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 62ad5a50..38d0f3d8 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -232,7 +232,7 @@ def test_arrow_bool_roundtrip(tmpdir): @pytest.mark.skipif( - __gdal_version__ >= (3, 8, 8), reason="Arrow bool value bug fixed in GDAL >= 3.8.3" + __gdal_version__ >= (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3" ) def test_arrow_bool_exception(tmpdir): filename = os.path.join(str(tmpdir), "test.gpkg") diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index a9e1acd4..7b4ef401 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1480,7 +1480,7 @@ def test_arrow_bool_roundtrip(tmpdir): filename = os.path.join(str(tmpdir), "test.gpkg") df = gp.GeoDataFrame( - {{"bool_col": [True, False, True, False, True], "geometry": [Point(0, 0)] * 5}}, + {"bool_col": [True, False, True, False, True], "geometry": [Point(0, 0)] * 5}, crs="EPSG:4326", ) @@ -1490,7 +1490,7 @@ def test_arrow_bool_roundtrip(tmpdir): @pytest.mark.skipif( - __gdal_version__ >= (3, 8, 8), reason="Arrow bool value bug fixed in GDAL >= 3.8.3" + __gdal_version__ >= (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3" ) def test_arrow_bool_exception(tmpdir): filename = os.path.join(str(tmpdir), "test.gpkg") From 401817326743faeb913c4a503dda122b7e7d0051 Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 22 Jan 2024 18:07:06 -0800 Subject: [PATCH 5/6] Add missing skip for pyarrow --- pyogrio/tests/test_geopandas_io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 7b4ef401..cb0b3895 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1489,6 +1489,7 @@ def test_arrow_bool_roundtrip(tmpdir): assert_geodataframe_equal(result, df) +@requires_arrow_api @pytest.mark.skipif( __gdal_version__ >= (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3" ) From 027d485e70f839c780cfe4e88d50668ae79ed024 Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Tue, 23 Jan 2024 10:10:15 -0800 Subject: [PATCH 6/6] Switch bool test xfail to skipif for known failing GDAL versions --- pyogrio/tests/test_arrow.py | 5 ++--- pyogrio/tests/test_geopandas_io.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 38d0f3d8..aae55b9c 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -208,9 +208,8 @@ def test_enable_with_environment_variable(test_ogr_types_list): assert "list_int64" in result.columns -@pytest.mark.xfail( - __gdal_version__ < (3, 8, 3), - reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8998", +@pytest.mark.skipif( + __gdal_version__ < (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3" ) def test_arrow_bool_roundtrip(tmpdir): filename = os.path.join(str(tmpdir), "test.gpkg") diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index cb0b3895..6b7564b9 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1472,9 +1472,8 @@ def test_read_dataframe_arrow_dtypes(tmp_path): @requires_arrow_api -@pytest.mark.xfail( - __gdal_version__ < (3, 8, 3), - reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8998", +@pytest.mark.skipif( + __gdal_version__ < (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3" ) def test_arrow_bool_roundtrip(tmpdir): filename = os.path.join(str(tmpdir), "test.gpkg")