-
Notifications
You must be signed in to change notification settings - Fork 23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: Calculate feature count ourselves if GDAL returns unknown count (-1), better support OSM data #271
ENH: Calculate feature count ourselves if GDAL returns unknown count (-1), better support OSM data #271
Changes from all commits
ee02917
cfa3409
d8711b3
f1ca42e
eaf75e9
31d7df3
03e0e77
d4630f3
4701cc2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -195,6 +195,17 @@ cdef OGRLayerH get_ogr_layer(GDALDatasetH ogr_dataset, layer) except NULL: | |
except CPLE_BaseError as exc: | ||
raise DataLayerError(str(exc)) | ||
|
||
# if the driver is OSM, we need to execute SQL to set the layer to read in | ||
# order to read it properly | ||
Comment on lines
+198
to
+199
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this something to report to GDAL? Or is it expected we need to set this differently? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
if get_driver(ogr_dataset) == "OSM": | ||
# Note: this returns NULL and does not need to be freed via | ||
# GDALDatasetReleaseResultSet() | ||
layer_name = get_string(OGR_L_GetName(ogr_layer)) | ||
sql_b = f"SET interest_layers = {layer_name}".encode('utf-8') | ||
sql_c = sql_b | ||
|
||
GDALDatasetExecuteSQL(ogr_dataset, sql_c, NULL, NULL) | ||
|
||
return ogr_layer | ||
|
||
|
||
|
@@ -309,6 +320,62 @@ cdef get_driver(OGRDataSourceH ogr_dataset): | |
return driver | ||
|
||
|
||
cdef get_feature_count(OGRLayerH ogr_layer): | ||
"""Get the feature count of a layer. | ||
|
||
If GDAL returns an unknown count (-1), this iterates over every feature | ||
to calculate the count. | ||
|
||
Parameters | ||
---------- | ||
ogr_layer : pointer to open OGR layer | ||
|
||
Returns | ||
------- | ||
int | ||
count of features | ||
""" | ||
|
||
cdef OGRFeatureH ogr_feature = NULL | ||
cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, 1) | ||
|
||
# if GDAL refuses to give us the feature count, we have to loop over all | ||
# features ourselves and get the count. This can happen for some drivers | ||
# (e.g., OSM) or if a where clause is invalid but not rejected as error | ||
if feature_count == -1: | ||
# make sure layer is read from beginning | ||
OGR_L_ResetReading(ogr_layer) | ||
|
||
feature_count = 0 | ||
while True: | ||
try: | ||
ogr_feature = exc_wrap_pointer(OGR_L_GetNextFeature(ogr_layer)) | ||
feature_count +=1 | ||
|
||
except NullPointerError: | ||
# No more rows available, so stop reading | ||
break | ||
|
||
# driver may raise other errors, e.g., for OSM if node ids are not | ||
# increasing, the default config option OSM_USE_CUSTOM_INDEXING=YES | ||
# causes errors iterating over features | ||
except CPLE_BaseError as exc: | ||
# if an invalid where clause is used for a GPKG file, it is not | ||
# caught as an error until attempting to iterate over features; | ||
# catch it here | ||
if "failed to prepare SQL" in str(exc): | ||
raise ValueError(f"Invalid SQL query: {str(exc)}") from None | ||
|
||
raise DataLayerError(f"Could not iterate over features: {str(exc)}") from None | ||
|
||
finally: | ||
if ogr_feature != NULL: | ||
OGR_F_Destroy(ogr_feature) | ||
ogr_feature = NULL | ||
|
||
return feature_count | ||
|
||
|
||
cdef set_metadata(GDALMajorObjectH obj, object metadata): | ||
"""Set metadata on a dataset or layer | ||
|
||
|
@@ -371,13 +438,19 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): | |
------- | ||
str or None | ||
""" | ||
|
||
if OGR_L_TestCapability(ogr_layer, OLCStringsAsUTF8): | ||
return 'UTF-8' | ||
|
||
driver = get_driver(ogr_dataset) | ||
if driver == 'ESRI Shapefile': | ||
return 'ISO-8859-1' | ||
|
||
if driver == "OSM": | ||
# always set OSM data to UTF-8 | ||
# per https://help.openstreetmap.org/questions/2172/what-encoding-does-openstreetmap-use | ||
return "UTF-8" | ||
|
||
return None | ||
|
||
|
||
|
@@ -486,11 +559,10 @@ cdef apply_where_filter(OGRLayerH ogr_layer, str where): | |
if err != OGRERR_NONE: | ||
try: | ||
exc_check() | ||
name = OGR_L_GetName(ogr_layer) | ||
except CPLE_BaseError as exc: | ||
raise ValueError(str(exc)) | ||
|
||
raise ValueError(f"Invalid SQL query for layer '{name}': '{where}'") | ||
raise ValueError(f"Invalid SQL query for layer '{OGR_L_GetName(ogr_layer)}': '{where}'") | ||
|
||
|
||
cdef apply_spatial_filter(OGRLayerH ogr_layer, bbox): | ||
|
@@ -526,11 +598,10 @@ cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_fe | |
skip_features : number of features to skip from beginning of available range | ||
max_features : maximum number of features to read from available range | ||
""" | ||
feature_count = OGR_L_GetFeatureCount(ogr_layer, 1) | ||
feature_count = get_feature_count(ogr_layer) | ||
num_features = max_features | ||
|
||
if feature_count <= 0: | ||
# the count comes back as -1 if the where clause above is invalid but not rejected as error | ||
if feature_count == 0: | ||
name = OGR_L_GetName(ogr_layer) | ||
warnings.warn(f"Layer '{name}' does not have any features to read") | ||
return 0, 0 | ||
|
@@ -736,9 +807,6 @@ cdef get_features( | |
break | ||
|
||
except CPLE_BaseError as exc: | ||
if "failed to prepare SQL" in str(exc): | ||
raise ValueError(f"Invalid SQL query") from exc | ||
|
||
raise FeatureError(str(exc)) | ||
|
||
if i >= num_features: | ||
|
@@ -886,10 +954,7 @@ cdef get_bounds( | |
break | ||
|
||
except CPLE_BaseError as exc: | ||
if "failed to prepare SQL" in str(exc): | ||
raise ValueError(f"Invalid SQL query") from exc | ||
else: | ||
raise FeatureError(str(exc)) | ||
raise FeatureError(str(exc)) | ||
|
||
if i >= num_features: | ||
raise FeatureError( | ||
|
@@ -1327,7 +1392,7 @@ def ogr_read_info( | |
'fields': fields[:,2], # return only names | ||
'dtypes': fields[:,3], | ||
'geometry_type': get_geometry_type(ogr_layer), | ||
'features': OGR_L_GetFeatureCount(ogr_layer, 1), | ||
'features': get_feature_count(ogr_layer), | ||
'driver': get_driver(ogr_dataset), | ||
"capabilities": { | ||
"random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You could also mention use_arrow as an option. Based on my 1 test this is still not fast, but half as slow as without use_arrow=True. Or is it still too exprimental?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm seeing different results for Alabama (https://download.geofabrik.de/north-america/us/alabama-latest.osm.pbf) reading the
lines
layer:local, with arrow: 4.49s
local, without arrow: 3.46s
remote, with arrow: 23.55s
remote, without arrow: 20.26s
times are greater for
multipolygons
layer but follow similar trend locally but not for remote (likely because of much more network I/O, but it is remote enough from me that there could be huge variability in network I/O):local, with arrow: 6.82s
local, without arrow: 5.50s
remote, with arrow: 33.85s
remote, without arrow: 39.06s
whereas for points, the first and fastest layer:
local, with arrow 1.12s
local, without arrow: 0.68s
remote, with arrow: 40.36s
remote, without arrow: 20.09s
I had expected the arrow option to be somewhat faster, but that doesn't seem to be the case for me.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm... my timings are indeed different. They are consistently faster for use_arrow, and most of the time twice as fast.
Then again, the timings are also spectacularly longer for me for local files, most likely the difference between Windows and Linux... because CI tests also run a lot slower on Windows vs Linux :-(... To conclude, arrow seems less dramatically slow on Windows ;-).
Results:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
PS: I had to specify
encoding="utf8"
to be able to run it withuse_arrow=False
, probably because it is German data and they use quite some accents and stuff. Based on the following post and others, it seems osm data is always in "UTF-8":https://help.openstreetmap.org/questions/5308/character-encoding
So I think it might be better to default the
encoding
to "UTF-8" for .osm and .pbf?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a fix to always set
UTF-8
.It looks like using the interleaved encoding option has an impact on performance:
With default (no interleaved reading):
with
pyogrio.set_gdal_config_options({"OGR_INTERLEAVED_READING": True})
:There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm... interleaved reading is consitently quite a bit faster... and often
use_arrow=False
becomes faster thanuse_arrow=True
because of it. Kind of weird.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On windows, a similar pattern:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if there is a reason why interleaved isn't the default... maybe it is also useful to mention that it can improve performance?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am seeing the same behaviour. It's indeed a bit strange that with the interleaved reading turned on, reading with arrow is slower than without. Might be worth reporting to GDAL