ENH: Calculate feature count ourselves if GDAL returns unknown count (-1), better support OSM data (#271)
brendan-ward committed Aug 22, 2023
1 parent a847c27 commit 94c2bb4
Showing 6 changed files with 149 additions and 14 deletions.
7 changes: 7 additions & 0 deletions CHANGES.md
@@ -6,11 +6,18 @@

- Support writing dataframes without geometry column (#267)

- Calculate feature count by iterating over features if GDAL returns an
  unknown count for a data layer (e.g., OSM driver); this may have significant
  performance impacts for some data sources that would otherwise return an
  unknown count (count is used in `read_info`, `read`, `read_dataframe`) (#271).

### Bug fixes

- Fix int32 overflow when reading int64 columns (#260)
- Fix `fid_as_index=True` not setting fid as index when using `read_dataframe`
  with `use_arrow=True` (#265)
- Fix errors reading OSM data due to invalid feature count and incorrect
reading of OSM layers beyond the first layer (#271)

## 0.6.0 (2023-04-27)

42 changes: 42 additions & 0 deletions docs/source/known_issues.md
@@ -65,3 +65,45 @@ convert the datetimes to string.

Timezone information is ignored at the moment, both when reading and when writing
datetime columns.

## Support for OpenStreetMap (OSM) data

OpenStreetMap data do not natively support calculating the feature count by data
layer due to their internal data structures. To get around this, Pyogrio iterates
over all features once to calculate the feature count, which is used to allocate
the arrays that contain the geometries and attributes read from the data layer,
and then iterates over all features again to populate those arrays. Further, data
within the file are not structured at the top level to support fast reading by
layer, which means that reading data by layer may need to read all records
within the data source, not just those belonging to a particular layer. This is
inefficient and slow, and is exacerbated when attempting to read from
remotely-hosted data sources rather than local files.
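The two-pass read described above can be sketched in plain Python. This is only an illustration of the pattern, not pyogrio's actual (Cython) implementation; `make_reader` and the sample features are hypothetical, and the returned callable plays the role of OGR's `OGR_L_GetNextFeature`:

```python
def make_reader(features):
    """Stand-in for an OGR layer cursor: each call returns the next
    feature, or None when the layer is exhausted (hypothetical helper)."""
    it = iter(features)
    return lambda: next(it, None)


def count_features(read_next):
    """First pass: consume the cursor once purely to count features,
    mirroring what happens when GDAL reports an unknown (-1) count."""
    count = 0
    while read_next() is not None:
        count += 1
    return count


features = [{"id": 1}, {"id": 2}, {"id": 3}]  # illustrative sample data

# pass 1: count features (the real code then rewinds via OGR_L_ResetReading)
n = count_features(make_reader(features))

# pass 2: allocate fixed-size storage up front, then populate it
ids = [None] * n
read_next = make_reader(features)  # rewound cursor
for i in range(n):
    ids[i] = read_next()["id"]

print(n, ids)  # prints: 3 [1, 2, 3]
```

Because the counting pass consumes the whole layer before any data are read, the cost of reading an OSM layer is roughly doubled.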

GDAL may also instruct you via an error message to enable interleaved reading
mode when you try to read a large file without it, which you can do in one of
two ways:

1. Set config option used for all operations

```python
from pyogrio import set_gdal_config_options

set_gdal_config_options({"OGR_INTERLEAVED_READING": True})
```

2. Set dataset open option

```python
from pyogrio import read_dataframe

df = read_dataframe(path, INTERLEAVED_READING=True)
```

We recommend the following to sidestep performance issues:

- download remote OSM data sources to local files before attempting
to read
- the `use_arrow=True` option may speed up reading from OSM files
- if possible, use a different tool such as `ogr2ogr` to translate the OSM
data source into a more performant format for reading by layer, such as GPKG
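For the last recommendation, a one-time translation with `ogr2ogr` might look like the following (a sketch assuming GDAL's command-line tools are installed; `data.osm.pbf` and `data.gpkg` are placeholder file names):

```shell
# translate the OSM source into a GeoPackage once; GPKG supports fast
# per-layer reads, unlike the OSM driver
ogr2ogr -f GPKG data.gpkg data.osm.pbf --config OGR_INTERLEAVED_READING YES

# later reads can then target a single layer cheaply, e.g. in Python:
# read_dataframe("data.gpkg", layer="lines")
```

The translation pays the full scan cost once, after which each layer can be read independently.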
91 changes: 78 additions & 13 deletions pyogrio/_io.pyx
@@ -195,6 +195,17 @@ cdef OGRLayerH get_ogr_layer(GDALDatasetH ogr_dataset, layer) except NULL:
    except CPLE_BaseError as exc:
        raise DataLayerError(str(exc))

    # if the driver is OSM, we need to execute SQL to set the layer to read in
    # order to read it properly
    if get_driver(ogr_dataset) == "OSM":
        # Note: this returns NULL and does not need to be freed via
        # GDALDatasetReleaseResultSet()
        layer_name = get_string(OGR_L_GetName(ogr_layer))
        sql_b = f"SET interest_layers = {layer_name}".encode('utf-8')
        sql_c = sql_b

        GDALDatasetExecuteSQL(ogr_dataset, sql_c, NULL, NULL)

    return ogr_layer


@@ -309,6 +320,62 @@ cdef get_driver(OGRDataSourceH ogr_dataset):
    return driver


cdef get_feature_count(OGRLayerH ogr_layer):
    """Get the feature count of a layer.

    If GDAL returns an unknown count (-1), this iterates over every feature
    to calculate the count.

    Parameters
    ----------
    ogr_layer : pointer to open OGR layer

    Returns
    -------
    int
        count of features
    """

    cdef OGRFeatureH ogr_feature = NULL
    cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, 1)

    # if GDAL refuses to give us the feature count, we have to loop over all
    # features ourselves and get the count. This can happen for some drivers
    # (e.g., OSM) or if a where clause is invalid but not rejected as error
    if feature_count == -1:
        # make sure layer is read from beginning
        OGR_L_ResetReading(ogr_layer)

        feature_count = 0
        while True:
            try:
                ogr_feature = exc_wrap_pointer(OGR_L_GetNextFeature(ogr_layer))
                feature_count += 1

            except NullPointerError:
                # No more rows available, so stop reading
                break

            # driver may raise other errors, e.g., for OSM if node ids are not
            # increasing, the default config option OSM_USE_CUSTOM_INDEXING=YES
            # causes errors iterating over features
            except CPLE_BaseError as exc:
                # if an invalid where clause is used for a GPKG file, it is not
                # caught as an error until attempting to iterate over features;
                # catch it here
                if "failed to prepare SQL" in str(exc):
                    raise ValueError(f"Invalid SQL query: {str(exc)}") from None

                raise DataLayerError(f"Could not iterate over features: {str(exc)}") from None

            finally:
                if ogr_feature != NULL:
                    OGR_F_Destroy(ogr_feature)
                    ogr_feature = NULL

    return feature_count


cdef set_metadata(GDALMajorObjectH obj, object metadata):
    """Set metadata on a dataset or layer
@@ -371,13 +438,19 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
    -------
    str or None
    """

    if OGR_L_TestCapability(ogr_layer, OLCStringsAsUTF8):
        return 'UTF-8'

    driver = get_driver(ogr_dataset)
    if driver == 'ESRI Shapefile':
        return 'ISO-8859-1'

    if driver == "OSM":
        # always set OSM data to UTF-8
        # per https://help.openstreetmap.org/questions/2172/what-encoding-does-openstreetmap-use
        return "UTF-8"

    return None


@@ -486,11 +559,10 @@ cdef apply_where_filter(OGRLayerH ogr_layer, str where):
    if err != OGRERR_NONE:
        try:
            exc_check()
            name = OGR_L_GetName(ogr_layer)
        except CPLE_BaseError as exc:
            raise ValueError(str(exc))

        raise ValueError(f"Invalid SQL query for layer '{OGR_L_GetName(ogr_layer)}': '{where}'")


cdef apply_spatial_filter(OGRLayerH ogr_layer, bbox):
@@ -526,11 +598,10 @@ cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_fe
    skip_features : number of features to skip from beginning of available range
    max_features : maximum number of features to read from available range
    """
    feature_count = OGR_L_GetFeatureCount(ogr_layer, 1)
    feature_count = get_feature_count(ogr_layer)
    num_features = max_features

    if feature_count <= 0:
        # the count comes back as -1 if the where clause above is invalid but not rejected as error
    if feature_count == 0:
        name = OGR_L_GetName(ogr_layer)
        warnings.warn(f"Layer '{name}' does not have any features to read")
        return 0, 0
@@ -736,9 +807,6 @@ cdef get_features(
            break

        except CPLE_BaseError as exc:
            if "failed to prepare SQL" in str(exc):
                raise ValueError(f"Invalid SQL query") from exc

            raise FeatureError(str(exc))

        if i >= num_features:
@@ -886,10 +954,7 @@ cdef get_bounds(
            break

        except CPLE_BaseError as exc:
            if "failed to prepare SQL" in str(exc):
                raise ValueError(f"Invalid SQL query") from exc
            else:
                raise FeatureError(str(exc))
            raise FeatureError(str(exc))

        if i >= num_features:
            raise FeatureError(
@@ -1327,7 +1392,7 @@ def ogr_read_info(
        'fields': fields[:,2],  # return only names
        'dtypes': fields[:,3],
        'geometry_type': get_geometry_type(ogr_layer),
        'features': OGR_L_GetFeatureCount(ogr_layer, 1),
        'features': get_feature_count(ogr_layer),
        'driver': get_driver(ogr_dataset),
        "capabilities": {
            "random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead),
4 changes: 4 additions & 0 deletions pyogrio/tests/fixtures/README.md
@@ -83,3 +83,7 @@ This was extracted from https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrogra
```bash
ogr2ogr test_mixed_surface.gpkg NHDPLUS_H_0308_HU4_GDB.gdb NHDWaterbody -where '"NHDPlusID" = 15000300070477' -select "NHDPlusID"
```

### OSM PBF test

This was downloaded from https://github.com/openstreetmap/OSM-binary/blob/master/resources/sample.pbf
Binary file added pyogrio/tests/fixtures/sample.osm.pbf
19 changes: 18 additions & 1 deletion pyogrio/tests/test_core.py
@@ -12,7 +12,7 @@
    get_gdal_config_option,
    get_gdal_data_path,
)
from pyogrio.errors import DataSourceError
from pyogrio.errors import DataSourceError, DataLayerError

from pyogrio._env import GDALEnv

@@ -266,6 +266,23 @@ def test_read_info_invalid_dataset_kwargs(naturalearth_lowres):
        read_info(naturalearth_lowres, INVALID="YES")


def test_read_info_force_feature_count_exception(data_dir):
    with pytest.raises(DataLayerError, match="Could not iterate over features"):
        read_info(data_dir / "sample.osm.pbf", layer="lines")


def test_read_info_force_feature_count(data_dir):
    # the sample OSM file has non-increasing node IDs which causes the default
    # custom indexing to raise an exception iterating over features
    meta = read_info(data_dir / "sample.osm.pbf", USE_CUSTOM_INDEXING=False)
    assert meta["features"] == 8

    meta = read_info(
        data_dir / "sample.osm.pbf", layer="lines", USE_CUSTOM_INDEXING=False
    )
    assert meta["features"] == 36


@pytest.mark.parametrize(
    "name,value,expected",
    [
