diff --git a/CHANGES.md b/CHANGES.md index fff5a9a4..e614d343 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,11 +6,18 @@ - Support writing dataframes without geometry column (#267) +- Calculate feature count by iterating over features if GDAL returns an + unknown count for a data layer (e.g., OSM driver); this may have significant + performance impacts for some data sources that would otherwise return an + unknown count (count is used in `read_info`, `read`, `read_dataframe`) (#271). + ### Bug fixes - Fix int32 overflow when reading int64 columns (#260) - Fix `fid_as_index=True` doesn't set fid as index using `read_dataframe` with `use_arrow=True` (#265) +- Fix errors reading OSM data due to invalid feature count and incorrect + reading of OSM layers beyond the first layer (#271) ## 0.6.0 (2023-04-27) diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md index 0b27a9e5..a35bbe1d 100644 --- a/docs/source/known_issues.md +++ b/docs/source/known_issues.md @@ -65,3 +65,45 @@ convert the datetimes to string. Timezone information is ignored at the moment, both when reading and when writing datetime columns. + +## Support for OpenStreetMap (OSM) data + +OpenStreetMap data do not natively support calculating the feature count by data +layer due to the internal data structures. To get around this, Pyogrio iterates +over all features first to calculate the feature count that is used to allocate +arrays that contain the geometries and attributes read from the data layer, and +then iterates over all features again to populate those arrays. Further, data +within the file are not structured at the top level to support fast reading by +layer, which means that reading data by layer may need to read all records +within the data source, not just those belonging to a particular layer. This is +inefficient and slow, and is exacerbated when attempting to read from +remotely-hosted data sources rather than local files. 
+ +You may also be instructed by GDAL to enable interleaved reading mode via an +error message when you try to read a large file without it, which you can do in +one of two ways: + +1. Set config option used for all operations + +```python +from pyogrio import set_gdal_config_options + +set_gdal_config_options({"OGR_INTERLEAVED_READING": True}) +``` + +2. Set dataset open option + +```python + +from pyogrio import read_dataframe + +df = read_dataframe(path, INTERLEAVED_READING=True) +``` + +We recommend the following to sidestep performance issues: + +- download remote OSM data sources to local files before attempting + to read +- the `use_arrow=True` option may speed up reading from OSM files +- if possible, use a different tool such as `ogr2ogr` to translate the OSM + data source into a more performant format for reading by layer, such as GPKG diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 56006990..2540961e 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -195,6 +195,17 @@ cdef OGRLayerH get_ogr_layer(GDALDatasetH ogr_dataset, layer) except NULL: except CPLE_BaseError as exc: raise DataLayerError(str(exc)) + # if the driver is OSM, we need to execute SQL to set the layer to read in + # order to read it properly + if get_driver(ogr_dataset) == "OSM": + # Note: this returns NULL and does not need to be freed via + # GDALDatasetReleaseResultSet() + layer_name = get_string(OGR_L_GetName(ogr_layer)) + sql_b = f"SET interest_layers = {layer_name}".encode('utf-8') + sql_c = sql_b + + GDALDatasetExecuteSQL(ogr_dataset, sql_c, NULL, NULL) + return ogr_layer @@ -309,6 +320,62 @@ cdef get_driver(OGRDataSourceH ogr_dataset): return driver +cdef get_feature_count(OGRLayerH ogr_layer): + """Get the feature count of a layer. + + If GDAL returns an unknown count (-1), this iterates over every feature + to calculate the count. 
+ + Parameters + ---------- + ogr_layer : pointer to open OGR layer + + Returns + ------- + int + count of features + """ + + cdef OGRFeatureH ogr_feature = NULL + cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, 1) + + # if GDAL refuses to give us the feature count, we have to loop over all + # features ourselves and get the count. This can happen for some drivers + # (e.g., OSM) or if a where clause is invalid but not rejected as error + if feature_count == -1: + # make sure layer is read from beginning + OGR_L_ResetReading(ogr_layer) + + feature_count = 0 + while True: + try: + ogr_feature = exc_wrap_pointer(OGR_L_GetNextFeature(ogr_layer)) + feature_count +=1 + + except NullPointerError: + # No more rows available, so stop reading + break + + # driver may raise other errors, e.g., for OSM if node ids are not + # increasing, the default config option OSM_USE_CUSTOM_INDEXING=YES + # causes errors iterating over features + except CPLE_BaseError as exc: + # if an invalid where clause is used for a GPKG file, it is not + # caught as an error until attempting to iterate over features; + # catch it here + if "failed to prepare SQL" in str(exc): + raise ValueError(f"Invalid SQL query: {str(exc)}") from None + + raise DataLayerError(f"Could not iterate over features: {str(exc)}") from None + + finally: + if ogr_feature != NULL: + OGR_F_Destroy(ogr_feature) + ogr_feature = NULL + + return feature_count + + cdef set_metadata(GDALMajorObjectH obj, object metadata): """Set metadata on a dataset or layer @@ -371,6 +438,7 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): ------- str or None """ + if OGR_L_TestCapability(ogr_layer, OLCStringsAsUTF8): return 'UTF-8' @@ -378,6 +446,11 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): if driver == 'ESRI Shapefile': return 'ISO-8859-1' + if driver == "OSM": + # always set OSM data to UTF-8 + # per 
https://help.openstreetmap.org/questions/2172/what-encoding-does-openstreetmap-use + return "UTF-8" + return None @@ -486,11 +559,10 @@ cdef apply_where_filter(OGRLayerH ogr_layer, str where): if err != OGRERR_NONE: try: exc_check() - name = OGR_L_GetName(ogr_layer) except CPLE_BaseError as exc: raise ValueError(str(exc)) - raise ValueError(f"Invalid SQL query for layer '{name}': '{where}'") + raise ValueError(f"Invalid SQL query for layer '{OGR_L_GetName(ogr_layer)}': '{where}'") cdef apply_spatial_filter(OGRLayerH ogr_layer, bbox): @@ -526,11 +598,10 @@ cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_fe skip_features : number of features to skip from beginning of available range max_features : maximum number of features to read from available range """ - feature_count = OGR_L_GetFeatureCount(ogr_layer, 1) + feature_count = get_feature_count(ogr_layer) num_features = max_features - if feature_count <= 0: - # the count comes back as -1 if the where clause above is invalid but not rejected as error + if feature_count == 0: name = OGR_L_GetName(ogr_layer) warnings.warn(f"Layer '{name}' does not have any features to read") return 0, 0 @@ -736,9 +807,6 @@ cdef get_features( break except CPLE_BaseError as exc: - if "failed to prepare SQL" in str(exc): - raise ValueError(f"Invalid SQL query") from exc - raise FeatureError(str(exc)) if i >= num_features: @@ -886,10 +954,7 @@ cdef get_bounds( break except CPLE_BaseError as exc: - if "failed to prepare SQL" in str(exc): - raise ValueError(f"Invalid SQL query") from exc - else: - raise FeatureError(str(exc)) + raise FeatureError(str(exc)) if i >= num_features: raise FeatureError( @@ -1327,7 +1392,7 @@ def ogr_read_info( 'fields': fields[:,2], # return only names 'dtypes': fields[:,3], 'geometry_type': get_geometry_type(ogr_layer), - 'features': OGR_L_GetFeatureCount(ogr_layer, 1), + 'features': get_feature_count(ogr_layer), 'driver': get_driver(ogr_dataset), "capabilities": { "random_read": 
OGR_L_TestCapability(ogr_layer, OLCRandomRead), diff --git a/pyogrio/tests/fixtures/README.md b/pyogrio/tests/fixtures/README.md index f9bdede8..89ef7937 100644 --- a/pyogrio/tests/fixtures/README.md +++ b/pyogrio/tests/fixtures/README.md @@ -83,3 +83,7 @@ This was extracted from https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrogra ```bash ogr2ogr test_mixed_surface.gpkg NHDPLUS_H_0308_HU4_GDB.gdb NHDWaterbody -where '"NHDPlusID" = 15000300070477' -select "NHDPlusID" ``` + +### OSM PBF test + +This was downloaded from https://github.com/openstreetmap/OSM-binary/blob/master/resources/sample.pbf diff --git a/pyogrio/tests/fixtures/sample.osm.pbf b/pyogrio/tests/fixtures/sample.osm.pbf new file mode 100644 index 00000000..8a22edfe Binary files /dev/null and b/pyogrio/tests/fixtures/sample.osm.pbf differ diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index 13292261..538d6512 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -12,7 +12,7 @@ get_gdal_config_option, get_gdal_data_path, ) -from pyogrio.errors import DataSourceError +from pyogrio.errors import DataSourceError, DataLayerError from pyogrio._env import GDALEnv @@ -266,6 +266,23 @@ def test_read_info_invalid_dataset_kwargs(naturalearth_lowres): read_info(naturalearth_lowres, INVALID="YES") +def test_read_info_force_feature_count_exception(data_dir): + with pytest.raises(DataLayerError, match="Could not iterate over features"): + read_info(data_dir / "sample.osm.pbf", layer="lines") + + +def test_read_info_force_feature_count(data_dir): + # the sample OSM file has non-increasing node IDs which causes the default + # custom indexing to raise an exception iterating over features + meta = read_info(data_dir / "sample.osm.pbf", USE_CUSTOM_INDEXING=False) + assert meta["features"] == 8 + + meta = read_info( + data_dir / "sample.osm.pbf", layer="lines", USE_CUSTOM_INDEXING=False + ) + assert meta["features"] == 36 + + @pytest.mark.parametrize( "name,value,expected", [