From 28554a49d98d69afb9a141084574dfd45e062d99 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Sep 2023 22:29:39 +0200 Subject: [PATCH 01/17] Get layer bounds via pyogrio.read_info --- pyogrio/_io.pyx | 44 ++++++++++++++++++++++++++++++++------ pyogrio/_ogr.pxd | 1 + pyogrio/core.py | 22 +++++++++++++++++-- pyogrio/tests/test_core.py | 26 ++++++++++++++++------ 4 files changed, 79 insertions(+), 14 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 2540961e..3683c87d 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -320,7 +320,7 @@ cdef get_driver(OGRDataSourceH ogr_dataset): return driver -cdef get_feature_count(OGRLayerH ogr_layer): +cdef get_feature_count(OGRLayerH ogr_layer, int force): """Get the feature count of a layer. If GDAL returns an unknown count (-1), this iterates over every feature @@ -329,6 +329,8 @@ cdef get_feature_count(OGRLayerH ogr_layer): Parameters ---------- ogr_layer : pointer to open OGR layer + force : bool + True if the feature count should be computed even if it is expensive Returns ------- @@ -337,12 +339,12 @@ cdef get_feature_count(OGRLayerH ogr_layer): """ cdef OGRFeatureH ogr_feature = NULL - cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, 1) + cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, force) # if GDAL refuses to give us the feature count, we have to loop over all # features ourselves and get the count. This can happen for some drivers # (e.g., OSM) or if a where clause is invalid but not rejected as error - if feature_count == -1: + if force and feature_count == -1: # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) @@ -376,6 +378,33 @@ cdef get_feature_count(OGRLayerH ogr_layer): return feature_count +cdef get_total_bounds(OGRLayerH ogr_layer, int force): + """Get the total bounds of a layer. + + Parameters + ---------- + ogr_layer : pointer to open OGR layer + force : bool + True if the total bounds should be computed even if it is expensive + + Returns + ------- + Optional[Tuple[float, float, float, float]] + The total bounds of the layer, or None if they could not be determined. + """ + + cdef OGREnvelope ogr_envelope # = NULL + try: + exc_wrap_ogrerr(OGR_L_GetExtent(ogr_layer, &ogr_envelope, force)) + bounds = ( + ogr_envelope.MinX, ogr_envelope.MinY, ogr_envelope.MaxX, ogr_envelope.MaxY + ) + except CPLE_BaseError: + bounds = None + + return bounds + + cdef set_metadata(GDALMajorObjectH obj, object metadata): """Set metadata on a dataset or layer @@ -598,7 +627,7 @@ cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_fe skip_features : number of features to skip from beginning of available range max_features : maximum number of features to read from available range """ - feature_count = get_feature_count(ogr_layer) + feature_count = get_feature_count(ogr_layer, 1) num_features = max_features if feature_count == 0: @@ -1357,7 +1386,9 @@ def ogr_read_info( str path, dataset_kwargs, object layer=None, - object encoding=None): + object encoding=None, + int force_featurecount=False, + int force_total_bounds=False): cdef const char *path_c = NULL cdef char **dataset_options = NULL @@ -1392,7 +1423,8 @@ def ogr_read_info( 'fields': fields[:,2], # return only names 'dtypes': fields[:,3], 'geometry_type': get_geometry_type(ogr_layer), - 'features': get_feature_count(ogr_layer), + 'features': get_feature_count(ogr_layer, force_featurecount), + 'total_bounds': get_total_bounds(ogr_layer, force_total_bounds), 'driver': get_driver(ogr_dataset), "capabilities": { "random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead), diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 88f12d9d..d5dc572e 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -280,6 +280,7 @@ cdef extern from "ogr_api.h": const char* OGR_L_GetName(OGRLayerH layer) const char* OGR_L_GetFIDColumn(OGRLayerH layer) const char* OGR_L_GetGeometryColumn(OGRLayerH layer) + OGRErr OGR_L_GetExtent(OGRLayerH layer, OGREnvelope *psExtent, int bForce) OGRSpatialReferenceH OGR_L_GetSpatialRef(OGRLayerH layer) int OGR_L_TestCapability(OGRLayerH layer, const char *name) OGRFeatureDefnH OGR_L_GetLayerDefn(OGRLayerH layer) diff --git a/pyogrio/core.py b/pyogrio/core.py index 30552739..75663bc2 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -148,7 +148,15 @@ def read_bounds( return result -def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs): +def read_info( + path_or_buffer, + /, + layer=None, + encoding=None, + force_featurecount=False, + force_total_bounds=False, + **kwargs, +): """Read information about an OGR data source. ``crs`` and ``geometry`` will be ``None`` and ``features`` will be 0 for a @@ -163,6 +171,10 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs): If present, will be used as the encoding for reading string values from the data source, unless encoding can be inferred directly from the data source. + force_featurecount : bool, optional (default: False) + True if the feature count should be computed even if it is expensive. + force_total_bounds : bool, optional (default: False) + True if the total bounds should be computed even if it is expensive. **kwargs Additional driver-specific dataset open options passed to OGR. Invalid options will trigger a warning. @@ -179,6 +191,7 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs): "encoding": "", "geometry": "", "features": , + "total_bounds": "driver": "", "dataset_metadata" "" "layer_metadata" "" @@ -190,7 +203,12 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs): try: result = ogr_read_info( - path, layer=layer, encoding=encoding, dataset_kwargs=dataset_kwargs + path, + layer=layer, + encoding=encoding, + force_featurecount=force_featurecount, + force_total_bounds=force_total_bounds, + dataset_kwargs=dataset_kwargs, ) finally: if buffer is not None: diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index 538d6512..066d56e1 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -226,6 +226,7 @@ def test_read_info(naturalearth_lowres): assert meta["fields"].shape == (5,) assert meta["dtypes"].tolist() == ["int64", "object", "object", "object", "float64"] assert meta["features"] == 177 + assert meta["total_bounds"] == (-180, -90, 180.00000000000006, 83.64513000000001) assert meta["driver"] == "ESRI Shapefile" @@ -268,19 +269,32 @@ def test_read_info_invalid_dataset_kwargs(naturalearth_lowres): def test_read_info_force_feature_count_exception(data_dir): with pytest.raises(DataLayerError, match="Could not iterate over features"): - read_info(data_dir / "sample.osm.pbf", layer="lines") + read_info(data_dir / "sample.osm.pbf", layer="lines", force_featurecount=True) -def test_read_info_force_feature_count(data_dir): +@pytest.mark.parametrize( + "force_featurecount, exp_featurecount, exp_featurecount_lines", + [(True, 8, 36), (False, -1, -1)], +) +def test_read_info_force_feature_count( + data_dir, force_featurecount, exp_featurecount, exp_featurecount_lines +): # the sample OSM file has non-increasing node IDs which causes the default # custom indexing to raise an exception iterating over features - meta = read_info(data_dir / "sample.osm.pbf", USE_CUSTOM_INDEXING=False) - assert meta["features"] == 8 + meta = read_info( + data_dir / "sample.osm.pbf", + force_featurecount=force_featurecount, + USE_CUSTOM_INDEXING=False, + ) + assert meta["features"] == exp_featurecount meta = read_info( - data_dir / "sample.osm.pbf", layer="lines", USE_CUSTOM_INDEXING=False + data_dir / "sample.osm.pbf", + layer="lines", + force_featurecount=force_featurecount, + USE_CUSTOM_INDEXING=False, ) - assert meta["features"] == 36 + assert meta["features"] == exp_featurecount_lines @pytest.mark.parametrize( From ded790dafffef1bcc42db4823d1dd6e82233f714 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Sep 2023 22:36:19 +0200 Subject: [PATCH 02/17] Update CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index e614d343..86156860 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,6 +10,7 @@ unknown count for a data layer (e.g., OSM driver); this may have signficant performance impacts for some data sources that would otherwise return an unknown count (count is used in `read_info`, `read`, `read_dataframe`) (#271). +- Add `total_bounds` of layer to `read_info` (#281) ### Bug fixes From 53c5d991ae8deb1ba4cb27f94e5ae268d77e8b14 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Sep 2023 22:47:11 +0200 Subject: [PATCH 03/17] Fix gml tests -> force_featurecount needed --- pyogrio/core.py | 2 +- pyogrio/tests/test_raw_io.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyogrio/core.py b/pyogrio/core.py index 75663bc2..3241da75 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -191,7 +191,7 @@ def read_info( "encoding": "", "geometry": "", "features": , - "total_bounds": + "total_bounds": , "driver": "", "dataset_metadata" "" "layer_metadata" "" diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index e07384e3..55f27b17 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -472,7 +472,7 @@ def test_write_append_unsupported(tmpdir, naturalearth_lowres, driver, ext): assert os.path.exists(filename) - assert read_info(filename)["features"] == 177 + assert read_info(filename, force_featurecount=True)["features"] == 177 with pytest.raises(DataSourceError): write(filename, geometry, field_data, driver=driver, append=True, **meta) From fd1fd17ac8fbbd39c7a5f17ea2d75ea3115014ab Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Sep 2023 23:09:51 +0200 Subject: [PATCH 04/17] Add test on force_total_bounds --- pyogrio/tests/test_core.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index 066d56e1..d51320a5 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -12,6 +12,7 @@ get_gdal_config_option, get_gdal_data_path, ) +from pyogrio.tests.conftest import prepare_testfile from pyogrio.errors import DataSourceError, DataLayerError from pyogrio._env import GDALEnv @@ -297,6 +298,21 @@ def test_read_info_force_feature_count( assert meta["features"] == exp_featurecount_lines +@pytest.mark.parametrize( + "force_total_bounds, expected_total_bounds", + [(True, (-180.0, -90.0, 180.0, 83.64513)), (False, None)], +) +def test_read_info_force_total_bounds( + tmpdir, naturalearth_lowres, force_total_bounds, expected_total_bounds +): + # Geojson files don't hava a fast way to determine total_bounds + geojson_path = prepare_testfile(naturalearth_lowres, dst_dir=tmpdir, ext=".geojson") + assert ( + read_info(geojson_path, force_total_bounds=force_total_bounds)["total_bounds"] + == expected_total_bounds + ) + + @pytest.mark.parametrize( "name,value,expected", [ From f8baff5a609eaae498b9f2dae6ae8b6196d52aef Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Sep 2023 23:31:53 +0200 Subject: [PATCH 05/17] Improve doc --- pyogrio/core.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pyogrio/core.py b/pyogrio/core.py index 3241da75..09b8fd22 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -159,8 +159,14 @@ def read_info( ): """Read information about an OGR data source. - ``crs`` and ``geometry`` will be ``None`` and ``features`` will be 0 for a - nonspatial layer. + ``crs``, ``geometry`` and ``total_bounds`` will be ``None`` and ``features`` will be + 0 for a nonspatial layer. + + ``features`` will be -1 if determining this is expensive. You can force to calculate + it anyway using the ``force_featurecount`` parameter. + + ``total_bounds`` will be None if determining this is expensive. You can force to + calculate it anyway using the ``force_total_bounds`` parameter. Parameters ---------- @@ -190,8 +196,8 @@ def read_info( "dtypes": , "encoding": "", "geometry": "", - "features": , - "total_bounds": , + "features": , + "total_bounds": , "driver": "", "dataset_metadata" "" "layer_metadata" "" From ce587d68d646093a48604316bd3a6d70a82d1630 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 4 Sep 2023 23:32:28 +0200 Subject: [PATCH 06/17] Add test on read_info for file without geometry --- pyogrio/tests/test_core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index d51320a5..d5092ad0 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -313,6 +313,10 @@ def test_read_info_force_total_bounds( ) +def test_read_info_without_geometry(test_fgdb_vsi): + assert read_info(test_fgdb_vsi)["total_bounds"] is None + + @pytest.mark.parametrize( "name,value,expected", [ From d7eef5ea4974183e74274d9b0ba701865ed44101 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 5 Sep 2023 23:55:21 +0200 Subject: [PATCH 07/17] Improve doc per suggestions --- pyogrio/core.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pyogrio/core.py b/pyogrio/core.py index 09b8fd22..be5b12db 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -162,11 +162,13 @@ def read_info( ``crs``, ``geometry`` and ``total_bounds`` will be ``None`` and ``features`` will be 0 for a nonspatial layer. - ``features`` will be -1 if determining this is expensive. You can force to calculate - it anyway using the ``force_featurecount`` parameter. + ``features`` will be -1 if this is an expensive operation for this driver. You can + force it to be calculated using the ``force_featurecount`` parameter. - ``total_bounds`` will be None if determining this is expensive. You can force to - calculate it anyway using the ``force_total_bounds`` parameter. + ``total_bounds`` is the 2-dimensional extent of all features within the dataset: + (xmin, ymin, xmax, ymax). It will be None if this operation is expensive for this + driver or if the data source is nonspatial. You can force it to be calculated using + the ``force_total_bounds`` parameter. Parameters ---------- From 72eed72d879b3c5588996390f7e010b852ff263b Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 6 Sep 2023 00:03:54 +0200 Subject: [PATCH 08/17] Rename featurecount to feature_count --- pyogrio/_io.pyx | 4 ++-- pyogrio/core.py | 8 ++++---- pyogrio/tests/test_core.py | 14 +++++++------- pyogrio/tests/test_raw_io.py | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 3683c87d..d0729332 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1387,7 +1387,7 @@ def ogr_read_info( dataset_kwargs, object layer=None, object encoding=None, - int force_featurecount=False, + int force_feature_count=False, int force_total_bounds=False): cdef const char *path_c = NULL @@ -1423,7 +1423,7 @@ def ogr_read_info( 'fields': fields[:,2], # return only names 'dtypes': fields[:,3], 'geometry_type': get_geometry_type(ogr_layer), - 'features': get_feature_count(ogr_layer, force_featurecount), + 'features': get_feature_count(ogr_layer, force_feature_count), 'total_bounds': get_total_bounds(ogr_layer, force_total_bounds), 'driver': get_driver(ogr_dataset), "capabilities": { diff --git a/pyogrio/core.py b/pyogrio/core.py index be5b12db..99bb0269 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -153,7 +153,7 @@ def read_info( /, layer=None, encoding=None, - force_featurecount=False, + force_feature_count=False, force_total_bounds=False, **kwargs, ): @@ -163,7 +163,7 @@ def read_info( 0 for a nonspatial layer. ``features`` will be -1 if this is an expensive operation for this driver. You can - force it to be calculated using the ``force_featurecount`` parameter. + force it to be calculated using the ``force_feature_count`` parameter. ``total_bounds`` is the 2-dimensional extent of all features within the dataset: (xmin, ymin, xmax, ymax). It will be None if this operation is expensive for this @@ -179,7 +179,7 @@ def read_info( If present, will be used as the encoding for reading string values from the data source, unless encoding can be inferred directly from the data source. - force_featurecount : bool, optional (default: False) + force_feature_count : bool, optional (default: False) True if the feature count should be computed even if it is expensive. force_total_bounds : bool, optional (default: False) True if the total bounds should be computed even if it is expensive. @@ -214,7 +214,7 @@ def read_info( path, layer=layer, encoding=encoding, - force_featurecount=force_featurecount, + force_feature_count=force_feature_count, force_total_bounds=force_total_bounds, dataset_kwargs=dataset_kwargs, ) diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index d5092ad0..28ff3120 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -270,32 +270,32 @@ def test_read_info_invalid_dataset_kwargs(naturalearth_lowres): def test_read_info_force_feature_count_exception(data_dir): with pytest.raises(DataLayerError, match="Could not iterate over features"): - read_info(data_dir / "sample.osm.pbf", layer="lines", force_featurecount=True) + read_info(data_dir / "sample.osm.pbf", layer="lines", force_feature_count=True) @pytest.mark.parametrize( - "force_featurecount, exp_featurecount, exp_featurecount_lines", + "force_feature_count, exp_feature_count, exp_feature_count_lines", [(True, 8, 36), (False, -1, -1)], ) def test_read_info_force_feature_count( - data_dir, force_featurecount, exp_featurecount, exp_featurecount_lines + data_dir, force_feature_count, exp_feature_count, exp_feature_count_lines ): # the sample OSM file has non-increasing node IDs which causes the default # custom indexing to raise an exception iterating over features meta = read_info( data_dir / "sample.osm.pbf", - force_featurecount=force_featurecount, + force_feature_count=force_feature_count, USE_CUSTOM_INDEXING=False, ) - assert meta["features"] == exp_featurecount + assert meta["features"] == exp_feature_count meta = read_info( data_dir / "sample.osm.pbf", layer="lines", - force_featurecount=force_featurecount, + force_feature_count=force_feature_count, USE_CUSTOM_INDEXING=False, ) - assert meta["features"] == exp_featurecount_lines + assert meta["features"] == exp_feature_count_lines @pytest.mark.parametrize( diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 55f27b17..6f6f9660 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -472,7 +472,7 @@ def test_write_append_unsupported(tmpdir, naturalearth_lowres, driver, ext): assert os.path.exists(filename) - assert read_info(filename, force_featurecount=True)["features"] == 177 + assert read_info(filename, force_feature_count=True)["features"] == 177 with pytest.raises(DataSourceError): write(filename, geometry, field_data, driver=driver, append=True, **meta) From b5564ac7813f9fec73e3db86d4950e9e2b832ea3 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 6 Sep 2023 00:04:13 +0200 Subject: [PATCH 09/17] Update doc --- pyogrio/_io.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index d0729332..98edf29c 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -389,7 +389,7 @@ cdef get_total_bounds(OGRLayerH ogr_layer, int force): Returns ------- - Optional[Tuple[float, float, float, float]] + tuple of (xmin, ymin, xmax, ymax) or None The total bounds of the layer, or None if they could not be determined. """ From 7cbca96f51be18a30966a4bbb35cb0f9fee498e4 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 6 Sep 2023 00:12:41 +0200 Subject: [PATCH 10/17] Use allclose to assert total_bounds --- pyogrio/tests/test_core.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index 28ff3120..0f33243f 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -227,7 +227,7 @@ def test_read_info(naturalearth_lowres): assert meta["fields"].shape == (5,) assert meta["dtypes"].tolist() == ["int64", "object", "object", "object", "float64"] assert meta["features"] == 177 - assert meta["total_bounds"] == (-180, -90, 180.00000000000006, 83.64513000000001) + assert allclose(meta["total_bounds"], (-180, -90, 180, 83.64513)) assert meta["driver"] == "ESRI Shapefile" @@ -307,10 +307,11 @@ def test_read_info_force_total_bounds( ): # Geojson files don't hava a fast way to determine total_bounds geojson_path = prepare_testfile(naturalearth_lowres, dst_dir=tmpdir, ext=".geojson") - assert ( - read_info(geojson_path, force_total_bounds=force_total_bounds)["total_bounds"] - == expected_total_bounds - ) + info = read_info(geojson_path, force_total_bounds=force_total_bounds) + if expected_total_bounds is not None: + assert allclose(info["total_bounds"], expected_total_bounds) + else: + assert info["total_bounds"] is None def test_read_info_without_geometry(test_fgdb_vsi): From 0ccda6d96a34d8d90252d6626b1e27b04283bbc9 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 6 Sep 2023 00:16:06 +0200 Subject: [PATCH 11/17] Parametrize test_read_info_force_feature_count --- pyogrio/tests/test_core.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index 0f33243f..cd93372e 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -274,28 +274,24 @@ def test_read_info_force_feature_count_exception(data_dir): @pytest.mark.parametrize( - "force_feature_count, exp_feature_count, exp_feature_count_lines", - [(True, 8, 36), (False, -1, -1)], + "layer, force, expected", + [ + ("points", False, -1), + ("points", True, 8), + ("lines", False, -1), + ("lines", True, 36), + ], ) -def test_read_info_force_feature_count( - data_dir, force_feature_count, exp_feature_count, exp_feature_count_lines -): +def test_read_info_force_feature_count(data_dir, layer, force, expected): # the sample OSM file has non-increasing node IDs which causes the default # custom indexing to raise an exception iterating over features meta = read_info( data_dir / "sample.osm.pbf", - force_feature_count=force_feature_count, - USE_CUSTOM_INDEXING=False, - ) - assert meta["features"] == exp_feature_count - - meta = read_info( - data_dir / "sample.osm.pbf", - layer="lines", - force_feature_count=force_feature_count, + layer=layer, + force_feature_count=force, USE_CUSTOM_INDEXING=False, ) - assert meta["features"] == exp_feature_count_lines + assert meta["features"] == expected @pytest.mark.parametrize( From 66390f2d0ee558be103d203a1d0d523a8dc98db0 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 6 Sep 2023 00:33:32 +0200 Subject: [PATCH 12/17] Add the relevant capabilities bools in read_info --- pyogrio/_io.pyx | 8 +++++--- pyogrio/_ogr.pxd | 2 ++ pyogrio/tests/test_core.py | 5 +++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 98edf29c..9dec06c6 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1427,9 +1427,11 @@ def ogr_read_info( 'total_bounds': get_total_bounds(ogr_layer, force_total_bounds), 'driver': get_driver(ogr_dataset), "capabilities": { - "random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead), - "fast_set_next_by_index": OGR_L_TestCapability(ogr_layer, OLCFastSetNextByIndex), - "fast_spatial_filter": OGR_L_TestCapability(ogr_layer, OLCFastSpatialFilter), + "random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead) == 1, + "fast_set_next_by_index": OGR_L_TestCapability(ogr_layer, OLCFastSetNextByIndex) == 1, + "fast_spatial_filter": OGR_L_TestCapability(ogr_layer, OLCFastSpatialFilter) == 1, + "fast_feature_count": OGR_L_TestCapability(ogr_layer, OLCFastFeatureCount) == 1, + "fast_total_bounds": OGR_L_TestCapability(ogr_layer, OLCFastGetExtent) == 1, }, 'layer_metadata': get_metadata(ogr_layer), 'dataset_metadata': get_metadata(ogr_dataset), diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index d5dc572e..38f777ff 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -302,6 +302,8 @@ cdef extern from "ogr_api.h": const char* OLCRandomRead const char* OLCFastSetNextByIndex const char* OLCFastSpatialFilter + const char* OLCFastFeatureCount + const char* OLCFastGetExtent const char* OLCTransactions diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index cd93372e..928f7f22 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -229,6 +229,11 @@ def test_read_info(naturalearth_lowres): assert meta["features"] == 177 assert allclose(meta["total_bounds"], (-180, -90, 180, 83.64513)) assert meta["driver"] == "ESRI Shapefile" + assert meta["capabilities"]["random_read"] is True + assert meta["capabilities"]["fast_set_next_by_index"] is True + assert meta["capabilities"]["fast_spatial_filter"] is False + assert meta["capabilities"]["fast_feature_count"] is True + assert meta["capabilities"]["fast_total_bounds"] is True @pytest.mark.parametrize( From 6a580a71c785f1835679707a4c952ba006c8f037 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 6 Sep 2023 00:49:40 +0200 Subject: [PATCH 13/17] Update CHANGES.md --- CHANGES.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 86156860..4517492d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,7 +10,8 @@ unknown count for a data layer (e.g., OSM driver); this may have signficant performance impacts for some data sources that would otherwise return an unknown count (count is used in `read_info`, `read`, `read_dataframe`) (#271). -- Add `total_bounds` of layer to `read_info` (#281) +- In `read_info`, the result now also contains the `total_bounds` of the layer as well + as some extra `capabilities` of the data source driver (#281) ### Bug fixes @@ -20,6 +21,15 @@ - Fix errors reading OSM data due to invalid feature count and incorrect reading of OSM layers beyond the first layer (#271) +### Potentially breaking changes + +- In `read_info` (#281): + - the `features` property in the result will now be -1 if calculating the + feature count is an expensive operation for this driver. You can force it to be + calculated using the `force_feature_count` parameter. + - for the keys in the `capabilities` property, the values will now be booleans + instead of 1 or 0. + ## 0.6.0 (2023-04-27) ### Improvements From f6d4656a9dac8a0596ebae440740ec536a1ce551 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 6 Sep 2023 00:50:19 +0200 Subject: [PATCH 14/17] Add read_info capabilities to doc --- pyogrio/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyogrio/core.py b/pyogrio/core.py index 99bb0269..70a11dfd 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -201,8 +201,9 @@ def read_info( "features": , "total_bounds": , "driver": "", - "dataset_metadata" "" - "layer_metadata" "" + "capabilities": "" + "dataset_metadata": "" + "layer_metadata": "" } """ path, buffer = get_vsi_path(path_or_buffer) From 38a87cb41ff31dfc32ca877d5fdb5b41092821da Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 6 Sep 2023 00:57:37 +0200 Subject: [PATCH 15/17] Update core.py --- pyogrio/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/core.py b/pyogrio/core.py index 70a11dfd..baa920b0 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -166,7 +166,7 @@ def read_info( force it to be calculated using the ``force_feature_count`` parameter. ``total_bounds`` is the 2-dimensional extent of all features within the dataset: - (xmin, ymin, xmax, ymax). It will be None if this operation is expensive for this + (xmin, ymin, xmax, ymax). It will be None if this is an expensive operation for this driver or if the data source is nonspatial. You can force it to be calculated using the ``force_total_bounds`` parameter. From cf0dfb2c5036e9712afc66b4cbe03712372640df Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 6 Sep 2023 08:02:35 +0200 Subject: [PATCH 16/17] Resolve feedback --- CHANGES.md | 4 ++-- pyogrio/_io.pyx | 15 +++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 4517492d..962aa7bd 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -27,8 +27,8 @@ - the `features` property in the result will now be -1 if calculating the feature count is an expensive operation for this driver. You can force it to be calculated using the `force_feature_count` parameter. - - for the keys in the `capabilities` property, the values will now be booleans - instead of 1 or 0. + - for boolean values in the `capabilities` property, the values will now be + booleans instead of 1 or 0. ## 0.6.0 (2023-04-27) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 9dec06c6..0732214e 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -393,12 +393,19 @@ cdef get_total_bounds(OGRLayerH ogr_layer, int force): The total bounds of the layer, or None if they could not be determined. """ - cdef OGREnvelope ogr_envelope # = NULL + cdef OGREnvelope ogr_envelope = NULL try: exc_wrap_ogrerr(OGR_L_GetExtent(ogr_layer, &ogr_envelope, force)) - bounds = ( - ogr_envelope.MinX, ogr_envelope.MinY, ogr_envelope.MaxX, ogr_envelope.MaxY - ) + if ogr_envelope == NULL: + bounds = None + else: + bounds = ( + ogr_envelope.MinX, + ogr_envelope.MinY, + ogr_envelope.MaxX, + ogr_envelope.MaxY, + ) + except CPLE_BaseError: bounds = None From 55936b8bc883f6c42074a1409cce9aaa35108e4a Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 6 Sep 2023 08:14:09 +0200 Subject: [PATCH 17/17] Rollback change to ogr_envelope checking a struct for NULL is not possible as it isn't a pointer --- pyogrio/_io.pyx | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 0732214e..9a9bc8b7 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -393,18 +393,12 @@ cdef get_total_bounds(OGRLayerH ogr_layer, int force): The total bounds of the layer, or None if they could not be determined. """ - cdef OGREnvelope ogr_envelope = NULL + cdef OGREnvelope ogr_envelope try: exc_wrap_ogrerr(OGR_L_GetExtent(ogr_layer, &ogr_envelope, force)) - if ogr_envelope == NULL: - bounds = None - else: - bounds = ( - ogr_envelope.MinX, - ogr_envelope.MinY, - ogr_envelope.MaxX, - ogr_envelope.MaxY, - ) + bounds = ( + ogr_envelope.MinX, ogr_envelope.MinY, ogr_envelope.MaxX, ogr_envelope.MaxY + ) except CPLE_BaseError: bounds = None