
ENH: Calculate feature count ourselves if GDAL returns unknown count (-1), better support OSM data #271

Merged (9 commits, Aug 22, 2023)
7 changes: 7 additions & 0 deletions CHANGES.md
@@ -6,11 +6,18 @@

- Support writing dataframes without geometry column (#267)

- Calculate feature count by iterating over features if GDAL returns an
unknown count for a data layer (e.g., OSM driver); this may have significant
performance impacts for some data sources that would otherwise return an
unknown count (count is used in `read_info`, `read`, `read_dataframe`) (#271).

### Bug fixes

- Fix int32 overflow when reading int64 columns (#260)
- Fix `fid_as_index=True` doesn't set fid as index using `read_dataframe` with
`use_arrow=True` (#265)
- Fix errors reading OSM data due to invalid feature count and incorrect
reading of OSM layers beyond the first layer (#271)

## 0.6.0 (2023-04-27)

42 changes: 42 additions & 0 deletions docs/source/known_issues.md
@@ -65,3 +65,45 @@ convert the datetimes to string.

Timezone information is ignored at the moment, both when reading and when writing
datetime columns.

## Support for OpenStreetMap (OSM) data

OpenStreetMap data do not natively support calculating the feature count by data
layer due to the internal data structures. To work around this, Pyogrio iterates
over all features once to calculate the feature count, which is used to allocate
the arrays that hold the geometries and attributes read from the data layer, and
then iterates over all features again to populate those arrays. Further, data
within the file are not structured at the top level to support fast reading by
layer, which means that reading one layer may require reading all records
within the data source, not just those belonging to that layer. This is
inefficient and slow, and is exacerbated when attempting to read from
remotely-hosted data sources rather than local files.
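The two-pass strategy described above can be sketched in plain Python. This is a simplified illustration only, not pyogrio's actual Cython implementation; `FakeLayer`, `read_two_pass`, and their method names are invented for the example to mimic forward-only OGR layer iteration:

```python
# Simplified sketch of the two-pass read used when the driver cannot report
# a feature count up front (e.g., OSM): iterate once to count, allocate
# fixed-size output storage, then iterate again to fill it.
class FakeLayer:
    """Stand-in for an OGR layer that only supports forward iteration."""

    def __init__(self, records):
        self._records = records

    def reset_reading(self):
        # mirrors OGR_L_ResetReading: restart from the first feature
        self._iter = iter(self._records)

    def next_feature(self):
        # mirrors OGR_L_GetNextFeature: returns None when exhausted
        return next(self._iter, None)


def read_two_pass(layer):
    # pass 1: count the features
    layer.reset_reading()
    count = 0
    while layer.next_feature() is not None:
        count += 1

    # allocate output storage now that the size is known
    geometries = [None] * count

    # pass 2: populate the preallocated storage
    layer.reset_reading()
    for i in range(count):
        geometries[i] = layer.next_feature()["geometry"]
    return geometries


geoms = read_two_pass(
    FakeLayer([{"geometry": "POINT (0 0)"}, {"geometry": "POINT (1 1)"}])
)
print(geoms)  # ['POINT (0 0)', 'POINT (1 1)']
```

The cost is visible here: the data source is scanned twice, which is why an unknown count is expensive for large or remote OSM files.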

When reading a large file, GDAL may emit an error instructing you to enable
interleaved reading mode, which you can do in one of two ways:

1. Set a config option used for all operations:

   ```python
   from pyogrio import set_gdal_config_options

   set_gdal_config_options({"OGR_INTERLEAVED_READING": True})
   ```

2. Set a dataset open option:

   ```python
   from pyogrio import read_dataframe

   df = read_dataframe(path, INTERLEAVED_READING=True)
   ```

We recommend the following to sidestep performance issues:

**Member** commented on lines +103 to +104:

You could also mention `use_arrow` as an option. Based on my one test this is still not fast, but half as slow as without `use_arrow=True`. Or is it still too experimental?

**Member Author** replied:

I'm seeing different results for Alabama (https://download.geofabrik.de/north-america/us/alabama-latest.osm.pbf) reading the lines layer:

```
local, with arrow: 4.49s
local, without arrow: 3.46s
remote, with arrow: 23.55s
remote, without arrow: 20.26s
```

Times are greater for the multipolygons layer and follow a similar trend locally, but not for remote (likely because of much more network I/O; the server is also remote enough from me that there could be huge variability in network I/O):

```
local, with arrow: 6.82s
local, without arrow: 5.50s
remote, with arrow: 33.85s
remote, without arrow: 39.06s
```

whereas for points, the first and fastest layer:

```
local, with arrow: 1.12s
local, without arrow: 0.68s
remote, with arrow: 40.36s
remote, without arrow: 20.09s
```

I had expected the arrow option to be somewhat faster, but that doesn't seem to be the case for me.

**@theroggy** (Member) commented on Aug 18, 2023:

Hmm... my timings are indeed different. They are consistently faster for use_arrow, and most of the time twice as fast.

Then again, the timings are also spectacularly longer for me for local files, most likely the difference between Windows and Linux... because CI tests also run a lot slower on Windows vs Linux :-(... To conclude, arrow seems less dramatically slow on Windows ;-).

```python
from datetime import datetime
import itertools
import pyogrio

url_de_bw = "https://download.geofabrik.de/europe/germany/baden-wuerttemberg-latest.osm.pbf"
path_de_bw = "C:/temp/baden-wuerttemberg-latest.osm.pbf"

url_us_al = "https://download.geofabrik.de/north-america/us/alabama-latest.osm.pbf"
path_us_al = "C:/temp/alabama-latest.osm.pbf"

urls = [path_de_bw, path_us_al]
layers = ["multipolygons", "points"]

for url, layer, use_arrow in itertools.product(urls, layers, [True, False]):
    start = datetime.now()
    pgdf = pyogrio.read_dataframe(
        url, use_arrow=use_arrow, layer=layer, encoding="utf8"
    )
    print(f"url={url}, layer={layer}, use_arrow={use_arrow}: {datetime.now()-start}")
```

Results:

```
url=C:/temp/baden-wuerttemberg-latest.osm.pbf, layer=multipolygons, use_arrow=True: 0:01:46.250570
url=C:/temp/baden-wuerttemberg-latest.osm.pbf, layer=multipolygons, use_arrow=False: 0:03:40.812138
url=C:/temp/baden-wuerttemberg-latest.osm.pbf, layer=points, use_arrow=True: 0:00:18.412509
url=C:/temp/baden-wuerttemberg-latest.osm.pbf, layer=points, use_arrow=False: 0:00:24.491693
url=C:/temp/alabama-latest.osm.pbf, layer=multipolygons, use_arrow=True: 0:00:16.943871
url=C:/temp/alabama-latest.osm.pbf, layer=multipolygons, use_arrow=False: 0:00:30.859146
url=C:/temp/alabama-latest.osm.pbf, layer=points, use_arrow=True: 0:00:07.560858
url=C:/temp/alabama-latest.osm.pbf, layer=points, use_arrow=False: 0:00:08.725899
```

**@theroggy** (Member) commented on Aug 18, 2023:

PS: I had to specify `encoding="utf8"` to be able to run it with `use_arrow=False`, probably because it is German data with quite a few accented characters. Based on the following post and others, it seems OSM data is always in "UTF-8":
https://help.openstreetmap.org/questions/5308/character-encoding

So I think it might be better to default the encoding to "UTF-8" for .osm and .pbf files?

**Member Author** replied:

Added a fix to always set UTF-8.

It looks like using the interleaved reading option has an impact on performance:

With default (no interleaved reading):

```
url=baden-wuerttemberg-latest.osm.pbf, layer=multipolygons, use_arrow=True: 0:01:12.460372
url=baden-wuerttemberg-latest.osm.pbf, layer=multipolygons, use_arrow=False: 0:03:11.110123
url=baden-wuerttemberg-latest.osm.pbf, layer=points, use_arrow=True: 0:00:08.757978
url=baden-wuerttemberg-latest.osm.pbf, layer=points, use_arrow=False: 0:00:15.286818
url=alabama-latest.osm.pbf, layer=multipolygons, use_arrow=True: 0:00:06.517222
url=alabama-latest.osm.pbf, layer=multipolygons, use_arrow=False: 0:00:15.456438
url=alabama-latest.osm.pbf, layer=points, use_arrow=True: 0:00:01.148906
url=alabama-latest.osm.pbf, layer=points, use_arrow=False: 0:00:02.335114
```

With `pyogrio.set_gdal_config_options({"OGR_INTERLEAVED_READING": True})`:

```
url=baden-wuerttemberg-latest.osm.pbf, layer=multipolygons, use_arrow=True: 0:00:52.878035
url=baden-wuerttemberg-latest.osm.pbf, layer=multipolygons, use_arrow=False: 0:01:05.556191
url=baden-wuerttemberg-latest.osm.pbf, layer=points, use_arrow=True: 0:00:06.217804
url=baden-wuerttemberg-latest.osm.pbf, layer=points, use_arrow=False: 0:00:04.641645
url=alabama-latest.osm.pbf, layer=multipolygons, use_arrow=True: 0:00:06.525850
url=alabama-latest.osm.pbf, layer=multipolygons, use_arrow=False: 0:00:05.703674
url=alabama-latest.osm.pbf, layer=points, use_arrow=True: 0:00:00.852143
url=alabama-latest.osm.pbf, layer=points, use_arrow=False: 0:00:00.724238
```

**Member** replied:

Hmm... interleaved reading is consistently quite a bit faster... and often `use_arrow=False` becomes faster than `use_arrow=True` because of it. Kind of weird.

**Member** replied:

On Windows, a similar pattern:

```
url=C:/Temp/baden-wuerttemberg-latest.osm.pbf, layer=multipolygons, use_arrow=True, interleaved=True: 0:01:49.743053
url=C:/Temp/baden-wuerttemberg-latest.osm.pbf, layer=multipolygons, use_arrow=True, interleaved=False: 0:01:51.280849
url=C:/Temp/baden-wuerttemberg-latest.osm.pbf, layer=multipolygons, use_arrow=False, interleaved=True: 0:01:28.155260
url=C:/Temp/baden-wuerttemberg-latest.osm.pbf, layer=multipolygons, use_arrow=False, interleaved=False: 0:03:56.258161
url=C:/Temp/baden-wuerttemberg-latest.osm.pbf, layer=points, use_arrow=True, interleaved=True: 0:00:20.615849
url=C:/Temp/baden-wuerttemberg-latest.osm.pbf, layer=points, use_arrow=True, interleaved=False: 0:00:15.530900
url=C:/Temp/baden-wuerttemberg-latest.osm.pbf, layer=points, use_arrow=False, interleaved=True: 0:00:10.094106
url=C:/Temp/baden-wuerttemberg-latest.osm.pbf, layer=points, use_arrow=False, interleaved=False: 0:00:28.546472
url=C:/Temp/alabama-latest.osm.pbf, layer=multipolygons, use_arrow=True, interleaved=True: 0:00:19.117897
url=C:/Temp/alabama-latest.osm.pbf, layer=multipolygons, use_arrow=True, interleaved=False: 0:00:19.194775
url=C:/Temp/alabama-latest.osm.pbf, layer=multipolygons, use_arrow=False, interleaved=True: 0:00:17.296830
url=C:/Temp/alabama-latest.osm.pbf, layer=multipolygons, use_arrow=False, interleaved=False: 0:00:35.047402
url=C:/Temp/alabama-latest.osm.pbf, layer=points, use_arrow=True, interleaved=True: 0:00:07.781301
url=C:/Temp/alabama-latest.osm.pbf, layer=points, use_arrow=True, interleaved=False: 0:00:07.374248
url=C:/Temp/alabama-latest.osm.pbf, layer=points, use_arrow=False, interleaved=True: 0:00:06.801308
url=C:/Temp/alabama-latest.osm.pbf, layer=points, use_arrow=False, interleaved=False: 0:00:09.139016
```

**@theroggy** (Member) commented on Aug 19, 2023:

I wonder if there is a reason why interleaved reading isn't the default... maybe it is also useful to mention that it can improve performance?

**Member** replied:

I am seeing the same behaviour. It's indeed a bit strange that with interleaved reading turned on, reading with arrow is slower than without. Might be worth reporting to GDAL.

- download remote OSM data sources to local files before attempting
to read
- the `use_arrow=True` option may speed up reading from OSM files
- if possible, use a different tool such as `ogr2ogr` to translate the OSM
data source into a more performant format for reading by layer, such as GPKG
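As a concrete sketch of the last recommendation, the translation step might look like the following. The file and layer names are examples only, and the helper merely assembles the `ogr2ogr` argument list (which you could pass to `subprocess.run`) rather than executing anything:

```python
# Build the ogr2ogr invocation that converts one OSM layer to GPKG: the
# "translate once, read many times" workflow recommended above.
def osm_to_gpkg_cmd(osm_path, gpkg_path, layer):
    """Return the ogr2ogr argument list for translating one OSM layer."""
    return [
        "ogr2ogr",
        "-f", "GPKG",   # output driver: GeoPackage
        gpkg_path,      # destination file
        osm_path,       # source OSM PBF
        layer,          # e.g. "lines" or "multipolygons"
    ]


cmd = osm_to_gpkg_cmd("alabama-latest.osm.pbf", "alabama.gpkg", "lines")
print(" ".join(cmd))  # ogr2ogr -f GPKG alabama.gpkg alabama-latest.osm.pbf lines
```

After translating, subsequent reads of the GPKG copy avoid the repeated full scans of the OSM source described above.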
91 changes: 78 additions & 13 deletions pyogrio/_io.pyx
@@ -195,6 +195,17 @@ cdef OGRLayerH get_ogr_layer(GDALDatasetH ogr_dataset, layer) except NULL:
except CPLE_BaseError as exc:
raise DataLayerError(str(exc))

    # if the driver is OSM, we need to execute SQL to set the layer to read in
    # order to read it properly
**Member** commented on lines +198 to +199:

Is this something to report to GDAL? Or is it expected we need to set this differently?

**Member Author** replied:

ogr2ogr does this (which is how I figured out how to do this), so I think it is fair to say that GDAL knows about this and that it is intentional it is left for the user to call instead of being embedded in the driver. It is something we could suggest be added to the documentation for the driver, however.

    if get_driver(ogr_dataset) == "OSM":
        # Note: this returns NULL and does not need to be freed via
        # GDALDatasetReleaseResultSet()
        layer_name = get_string(OGR_L_GetName(ogr_layer))
        sql_b = f"SET interest_layers = {layer_name}".encode('utf-8')
        sql_c = sql_b

        GDALDatasetExecuteSQL(ogr_dataset, sql_c, NULL, NULL)

return ogr_layer


@@ -309,6 +320,62 @@ cdef get_driver(OGRDataSourceH ogr_dataset):
return driver


cdef get_feature_count(OGRLayerH ogr_layer):
    """Get the feature count of a layer.

    If GDAL returns an unknown count (-1), this iterates over every feature
    to calculate the count.

    Parameters
    ----------
    ogr_layer : pointer to open OGR layer

    Returns
    -------
    int
        count of features
    """

    cdef OGRFeatureH ogr_feature = NULL
    cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, 1)

    # if GDAL refuses to give us the feature count, we have to loop over all
    # features ourselves and get the count. This can happen for some drivers
    # (e.g., OSM) or if a where clause is invalid but not rejected as error
    if feature_count == -1:
        # make sure layer is read from beginning
        OGR_L_ResetReading(ogr_layer)

        feature_count = 0
        while True:
            try:
                ogr_feature = exc_wrap_pointer(OGR_L_GetNextFeature(ogr_layer))
                feature_count += 1

            except NullPointerError:
                # No more rows available, so stop reading
                break

            # driver may raise other errors, e.g., for OSM if node ids are not
            # increasing, the default config option OSM_USE_CUSTOM_INDEXING=YES
            # causes errors iterating over features
            except CPLE_BaseError as exc:
                # if an invalid where clause is used for a GPKG file, it is not
                # caught as an error until attempting to iterate over features;
                # catch it here
                if "failed to prepare SQL" in str(exc):
                    raise ValueError(f"Invalid SQL query: {str(exc)}") from None

                raise DataLayerError(
                    f"Could not iterate over features: {str(exc)}"
                ) from None

            finally:
                if ogr_feature != NULL:
                    OGR_F_Destroy(ogr_feature)
                    ogr_feature = NULL

    return feature_count


cdef set_metadata(GDALMajorObjectH obj, object metadata):
"""Set metadata on a dataset or layer

@@ -371,13 +438,19 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
-------
str or None
"""

    if OGR_L_TestCapability(ogr_layer, OLCStringsAsUTF8):
        return 'UTF-8'

    driver = get_driver(ogr_dataset)
    if driver == 'ESRI Shapefile':
        return 'ISO-8859-1'

    if driver == "OSM":
        # always set OSM data to UTF-8
        # per https://help.openstreetmap.org/questions/2172/what-encoding-does-openstreetmap-use
        return "UTF-8"

    return None


@@ -486,11 +559,10 @@ cdef apply_where_filter(OGRLayerH ogr_layer, str where):
if err != OGRERR_NONE:
try:
exc_check()
name = OGR_L_GetName(ogr_layer)
except CPLE_BaseError as exc:
raise ValueError(str(exc))

raise ValueError(f"Invalid SQL query for layer '{name}': '{where}'")
raise ValueError(f"Invalid SQL query for layer '{OGR_L_GetName(ogr_layer)}': '{where}'")


cdef apply_spatial_filter(OGRLayerH ogr_layer, bbox):
@@ -526,11 +598,10 @@ cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_fe
skip_features : number of features to skip from beginning of available range
max_features : maximum number of features to read from available range
"""
feature_count = OGR_L_GetFeatureCount(ogr_layer, 1)
feature_count = get_feature_count(ogr_layer)
num_features = max_features

if feature_count <= 0:
# the count comes back as -1 if the where clause above is invalid but not rejected as error
if feature_count == 0:
name = OGR_L_GetName(ogr_layer)
warnings.warn(f"Layer '{name}' does not have any features to read")
return 0, 0
@@ -736,9 +807,6 @@ cdef get_features(
break

except CPLE_BaseError as exc:
if "failed to prepare SQL" in str(exc):
raise ValueError(f"Invalid SQL query") from exc

raise FeatureError(str(exc))

if i >= num_features:
@@ -886,10 +954,7 @@ cdef get_bounds(
break

except CPLE_BaseError as exc:
if "failed to prepare SQL" in str(exc):
raise ValueError(f"Invalid SQL query") from exc
else:
raise FeatureError(str(exc))
raise FeatureError(str(exc))

if i >= num_features:
raise FeatureError(
@@ -1327,7 +1392,7 @@ def ogr_read_info(
'fields': fields[:,2], # return only names
'dtypes': fields[:,3],
'geometry_type': get_geometry_type(ogr_layer),
'features': OGR_L_GetFeatureCount(ogr_layer, 1),
'features': get_feature_count(ogr_layer),
'driver': get_driver(ogr_dataset),
"capabilities": {
"random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead),
4 changes: 4 additions & 0 deletions pyogrio/tests/fixtures/README.md
@@ -83,3 +83,7 @@ This was extracted from https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrogra
```bash
ogr2ogr test_mixed_surface.gpkg NHDPLUS_H_0308_HU4_GDB.gdb NHDWaterbody -where '"NHDPlusID" = 15000300070477' -select "NHDPlusID"
```

### OSM PBF test

This was downloaded from https://github.com/openstreetmap/OSM-binary/blob/master/resources/sample.pbf
Binary file added pyogrio/tests/fixtures/sample.osm.pbf
19 changes: 18 additions & 1 deletion pyogrio/tests/test_core.py
@@ -12,7 +12,7 @@
get_gdal_config_option,
get_gdal_data_path,
)
from pyogrio.errors import DataSourceError
from pyogrio.errors import DataSourceError, DataLayerError

from pyogrio._env import GDALEnv

@@ -266,6 +266,23 @@ def test_read_info_invalid_dataset_kwargs(naturalearth_lowres):
read_info(naturalearth_lowres, INVALID="YES")


def test_read_info_force_feature_count_exception(data_dir):
    with pytest.raises(DataLayerError, match="Could not iterate over features"):
        read_info(data_dir / "sample.osm.pbf", layer="lines")


def test_read_info_force_feature_count(data_dir):
    # the sample OSM file has non-increasing node IDs which causes the default
    # custom indexing to raise an exception iterating over features
    meta = read_info(data_dir / "sample.osm.pbf", USE_CUSTOM_INDEXING=False)
    assert meta["features"] == 8

    meta = read_info(
        data_dir / "sample.osm.pbf", layer="lines", USE_CUSTOM_INDEXING=False
    )
    assert meta["features"] == 36


@pytest.mark.parametrize(
"name,value,expected",
[