diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md index a5f21ca1..a35bbe1d 100644 --- a/docs/source/known_issues.md +++ b/docs/source/known_issues.md @@ -102,7 +102,8 @@ df = read_dataframe(path, INTERLEAVED_READING=True) We recommend the following to sidestep performance issues: -- always download remote OSM data sources to local files before attempting +- download remote OSM data sources to local files before attempting to read +- the `use_arrow=True` option may speed up reading from OSM files - if possible, use a different tool such as `ogr2ogr` to translate the OSM data source into a more performant format for reading by layer, such as GPKG diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index cbef8f68..26582a67 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -438,6 +438,7 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): ------- str or None """ + if OGR_L_TestCapability(ogr_layer, OLCStringsAsUTF8): return 'UTF-8' @@ -445,6 +446,11 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): if driver == 'ESRI Shapefile': return 'ISO-8859-1' + if driver == "OSM": + # always set OSM data to UTF-8 + # per https://help.openstreetmap.org/questions/2172/what-encoding-does-openstreetmap-use + return "UTF-8" + return None