Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add total_bounds of layer to read_info #281

Merged
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
unknown count for a data layer (e.g., OSM driver); this may have significant
performance impacts for some data sources that would otherwise return an
unknown count (count is used in `read_info`, `read`, `read_dataframe`) (#271).
- In `read_info`, the result now also contains the `total_bounds` of the layer as well
as some extra `capabilities` of the data source driver (#281).

### Bug fixes

Expand All @@ -19,6 +21,15 @@
- Fix errors reading OSM data due to invalid feature count and incorrect
reading of OSM layers beyond the first layer (#271)

### Potentially breaking changes

- In `read_info` (#281):
- the `features` property in the result will now be -1 if calculating the
feature count is an expensive operation for this driver. You can force it to be
calculated using the `force_feature_count` parameter.
- for the keys in the `capabilities` property, the values will now be booleans
theroggy marked this conversation as resolved.
Show resolved Hide resolved
instead of 1 or 0.

## 0.6.0 (2023-04-27)

### Improvements
Expand Down
52 changes: 43 additions & 9 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ cdef get_driver(OGRDataSourceH ogr_dataset):
return driver


cdef get_feature_count(OGRLayerH ogr_layer):
cdef get_feature_count(OGRLayerH ogr_layer, int force):
"""Get the feature count of a layer.

If GDAL returns an unknown count (-1), this iterates over every feature
Expand All @@ -329,6 +329,8 @@ cdef get_feature_count(OGRLayerH ogr_layer):
Parameters
----------
ogr_layer : pointer to open OGR layer
force : bool
True if the feature count should be computed even if it is expensive

Returns
-------
Expand All @@ -337,12 +339,12 @@ cdef get_feature_count(OGRLayerH ogr_layer):
"""

cdef OGRFeatureH ogr_feature = NULL
cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, 1)
cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, force)

# if GDAL refuses to give us the feature count, we have to loop over all
# features ourselves and get the count. This can happen for some drivers
# (e.g., OSM) or if a where clause is invalid but not rejected as error
if feature_count == -1:
if force and feature_count == -1:
# make sure layer is read from beginning
OGR_L_ResetReading(ogr_layer)

Expand Down Expand Up @@ -376,6 +378,33 @@ cdef get_feature_count(OGRLayerH ogr_layer):
return feature_count


cdef get_total_bounds(OGRLayerH ogr_layer, int force):
"""Get the total bounds of a layer.

Parameters
----------
ogr_layer : pointer to open OGR layer
force : bool
True if the total bounds should be computed even if it is expensive

Returns
-------
tuple of (xmin, ymin, xmax, ymax) or None
The total bounds of the layer, or None if they could not be determined.
"""

cdef OGREnvelope ogr_envelope # = NULL
theroggy marked this conversation as resolved.
Show resolved Hide resolved
try:
exc_wrap_ogrerr(OGR_L_GetExtent(ogr_layer, &ogr_envelope, force))
bounds = (
theroggy marked this conversation as resolved.
Show resolved Hide resolved
ogr_envelope.MinX, ogr_envelope.MinY, ogr_envelope.MaxX, ogr_envelope.MaxY
)
except CPLE_BaseError:
bounds = None

return bounds


cdef set_metadata(GDALMajorObjectH obj, object metadata):
"""Set metadata on a dataset or layer

Expand Down Expand Up @@ -598,7 +627,7 @@ cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_fe
skip_features : number of features to skip from beginning of available range
max_features : maximum number of features to read from available range
"""
feature_count = get_feature_count(ogr_layer)
feature_count = get_feature_count(ogr_layer, 1)
num_features = max_features

if feature_count == 0:
Expand Down Expand Up @@ -1357,7 +1386,9 @@ def ogr_read_info(
str path,
dataset_kwargs,
object layer=None,
object encoding=None):
object encoding=None,
int force_feature_count=False,
int force_total_bounds=False):

cdef const char *path_c = NULL
cdef char **dataset_options = NULL
Expand Down Expand Up @@ -1392,12 +1423,15 @@ def ogr_read_info(
'fields': fields[:,2], # return only names
'dtypes': fields[:,3],
'geometry_type': get_geometry_type(ogr_layer),
'features': get_feature_count(ogr_layer),
'features': get_feature_count(ogr_layer, force_feature_count),
'total_bounds': get_total_bounds(ogr_layer, force_total_bounds),
'driver': get_driver(ogr_dataset),
"capabilities": {
"random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead),
"fast_set_next_by_index": OGR_L_TestCapability(ogr_layer, OLCFastSetNextByIndex),
"fast_spatial_filter": OGR_L_TestCapability(ogr_layer, OLCFastSpatialFilter),
"random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead) == 1,
"fast_set_next_by_index": OGR_L_TestCapability(ogr_layer, OLCFastSetNextByIndex) == 1,
"fast_spatial_filter": OGR_L_TestCapability(ogr_layer, OLCFastSpatialFilter) == 1,
"fast_feature_count": OGR_L_TestCapability(ogr_layer, OLCFastFeatureCount) == 1,
"fast_total_bounds": OGR_L_TestCapability(ogr_layer, OLCFastGetExtent) == 1,
},
'layer_metadata': get_metadata(ogr_layer),
'dataset_metadata': get_metadata(ogr_dataset),
Expand Down
3 changes: 3 additions & 0 deletions pyogrio/_ogr.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ cdef extern from "ogr_api.h":
const char* OGR_L_GetName(OGRLayerH layer)
const char* OGR_L_GetFIDColumn(OGRLayerH layer)
const char* OGR_L_GetGeometryColumn(OGRLayerH layer)
OGRErr OGR_L_GetExtent(OGRLayerH layer, OGREnvelope *psExtent, int bForce)
OGRSpatialReferenceH OGR_L_GetSpatialRef(OGRLayerH layer)
int OGR_L_TestCapability(OGRLayerH layer, const char *name)
OGRFeatureDefnH OGR_L_GetLayerDefn(OGRLayerH layer)
Expand All @@ -301,6 +302,8 @@ cdef extern from "ogr_api.h":
const char* OLCRandomRead
const char* OLCFastSetNextByIndex
const char* OLCFastSpatialFilter
const char* OLCFastFeatureCount
const char* OLCFastGetExtent
const char* OLCTransactions


Expand Down
41 changes: 34 additions & 7 deletions pyogrio/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,27 @@ def read_bounds(
return result


def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs):
def read_info(
path_or_buffer,
/,
layer=None,
encoding=None,
force_feature_count=False,
force_total_bounds=False,
**kwargs,
):
"""Read information about an OGR data source.

``crs`` and ``geometry`` will be ``None`` and ``features`` will be 0 for a
nonspatial layer.
``crs``, ``geometry`` and ``total_bounds`` will be ``None`` and ``features`` will be
0 for a nonspatial layer.

``features`` will be -1 if calculating the feature count is an expensive operation
for this driver. You can force it to be calculated using the
``force_feature_count`` parameter.

``total_bounds`` is the 2-dimensional extent of all features within the dataset:
(xmin, ymin, xmax, ymax). It will be None if this is an expensive operation for this
driver or if the data source is nonspatial. You can force it to be calculated using
the ``force_total_bounds`` parameter.

Parameters
----------
Expand All @@ -163,6 +179,10 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs):
If present, will be used as the encoding for reading string values from
the data source, unless encoding can be inferred directly from the data
source.
force_feature_count : bool, optional (default: False)
True if the feature count should be computed even if it is expensive.
force_total_bounds : bool, optional (default: False)
True if the total bounds should be computed even if it is expensive.
**kwargs
Additional driver-specific dataset open options passed to OGR. Invalid
options will trigger a warning.
Expand All @@ -178,10 +198,12 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs):
"dtypes": <ndarray of field dtypes>,
"encoding": "<encoding>",
"geometry": "<geometry type>",
"features": <feature count>,
"features": <feature count or -1>,
"total_bounds": <tuple with total bounds or None>,
"driver": "<driver>",
"dataset_metadata" "<dict of dataset metadata or None>"
"layer_metadata" "<dict of layer metadata or None>"
"capabilities": "<dict of driver capabilities>"
"dataset_metadata": "<dict of dataset metadata or None>"
"layer_metadata": "<dict of layer metadata or None>"
}
"""
path, buffer = get_vsi_path(path_or_buffer)
Expand All @@ -190,7 +212,12 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs):

try:
result = ogr_read_info(
path, layer=layer, encoding=encoding, dataset_kwargs=dataset_kwargs
path,
layer=layer,
encoding=encoding,
force_feature_count=force_feature_count,
force_total_bounds=force_total_bounds,
dataset_kwargs=dataset_kwargs,
)
finally:
if buffer is not None:
Expand Down
50 changes: 43 additions & 7 deletions pyogrio/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
get_gdal_config_option,
get_gdal_data_path,
)
from pyogrio.tests.conftest import prepare_testfile
from pyogrio.errors import DataSourceError, DataLayerError

from pyogrio._env import GDALEnv
Expand Down Expand Up @@ -226,7 +227,13 @@ def test_read_info(naturalearth_lowres):
assert meta["fields"].shape == (5,)
assert meta["dtypes"].tolist() == ["int64", "object", "object", "object", "float64"]
assert meta["features"] == 177
assert allclose(meta["total_bounds"], (-180, -90, 180, 83.64513))
assert meta["driver"] == "ESRI Shapefile"
assert meta["capabilities"]["random_read"] is True
assert meta["capabilities"]["fast_set_next_by_index"] is True
assert meta["capabilities"]["fast_spatial_filter"] is False
assert meta["capabilities"]["fast_feature_count"] is True
assert meta["capabilities"]["fast_total_bounds"] is True


@pytest.mark.parametrize(
Expand Down Expand Up @@ -268,19 +275,48 @@ def test_read_info_invalid_dataset_kwargs(naturalearth_lowres):

def test_read_info_force_feature_count_exception(data_dir):
with pytest.raises(DataLayerError, match="Could not iterate over features"):
read_info(data_dir / "sample.osm.pbf", layer="lines")
read_info(data_dir / "sample.osm.pbf", layer="lines", force_feature_count=True)


def test_read_info_force_feature_count(data_dir):
@pytest.mark.parametrize(
theroggy marked this conversation as resolved.
Show resolved Hide resolved
"layer, force, expected",
[
("points", False, -1),
("points", True, 8),
("lines", False, -1),
("lines", True, 36),
],
)
def test_read_info_force_feature_count(data_dir, layer, force, expected):
# the sample OSM file has non-increasing node IDs which causes the default
# custom indexing to raise an exception iterating over features
meta = read_info(data_dir / "sample.osm.pbf", USE_CUSTOM_INDEXING=False)
assert meta["features"] == 8

meta = read_info(
data_dir / "sample.osm.pbf", layer="lines", USE_CUSTOM_INDEXING=False
data_dir / "sample.osm.pbf",
layer=layer,
force_feature_count=force,
USE_CUSTOM_INDEXING=False,
)
assert meta["features"] == 36
assert meta["features"] == expected


@pytest.mark.parametrize(
"force_total_bounds, expected_total_bounds",
[(True, (-180.0, -90.0, 180.0, 83.64513)), (False, None)],
)
def test_read_info_force_total_bounds(
tmpdir, naturalearth_lowres, force_total_bounds, expected_total_bounds
):
# GeoJSON files don't have a fast way to determine total_bounds
geojson_path = prepare_testfile(naturalearth_lowres, dst_dir=tmpdir, ext=".geojson")
info = read_info(geojson_path, force_total_bounds=force_total_bounds)
if expected_total_bounds is not None:
assert allclose(info["total_bounds"], expected_total_bounds)
else:
assert info["total_bounds"] is None


def test_read_info_without_geometry(test_fgdb_vsi):
assert read_info(test_fgdb_vsi)["total_bounds"] is None


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion pyogrio/tests/test_raw_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ def test_write_append_unsupported(tmpdir, naturalearth_lowres, driver, ext):

assert os.path.exists(filename)

assert read_info(filename)["features"] == 177
assert read_info(filename, force_feature_count=True)["features"] == 177

with pytest.raises(DataSourceError):
write(filename, geometry, field_data, driver=driver, append=True, **meta)
Expand Down
Loading