FIX: correctly use GDAL auto-decoding of shapefiles when encoding is set #384

Merged · 25 commits · Apr 26, 2024
Changes from 7 commits
2bfc862
Use SHAPE_ENCODING to disable auto-decoding of shapefiles
brendan-ward Apr 5, 2024
2b38f94
Correctly handle I/O for non-UTF-8 shapefiles
brendan-ward Apr 5, 2024
e738f7b
Use thread local config, add more tests
brendan-ward Apr 6, 2024
3362dae
Fix detection of encoding for shapefile layers via SQL
brendan-ward Apr 6, 2024
b991038
Expand encoding test cases
brendan-ward Apr 6, 2024
ecb56f6
Use ANSI code pages for alternative encodings (not DOS) and slightly …
brendan-ward Apr 6, 2024
994be50
Improve docs a little more
brendan-ward Apr 6, 2024
901320f
consolidate operations per PR feedback
brendan-ward Apr 8, 2024
4313fca
verify SHAPE_ENCODING global option is retained
brendan-ward Apr 8, 2024
5c4657d
Merge branch 'main' into issue380
brendan-ward Apr 8, 2024
ef602d5
Cleanup duplicate override to UTF-8
brendan-ward Apr 8, 2024
0e7e932
Merge branch 'main' into issue380
brendan-ward Apr 17, 2024
146967d
Merge branch 'main' into issue380
brendan-ward Apr 17, 2024
2bdf4dc
prevent combining encoding parameter and ENCODING open / creation option
brendan-ward Apr 17, 2024
b8cc902
Fix missing pyarrow constraint for test
brendan-ward Apr 17, 2024
e7db258
Try to verify that platform default encoding is used for CSV by default
brendan-ward Apr 17, 2024
eaecb18
split CSV platform encoding test out to verify it executes on Windows
brendan-ward Apr 17, 2024
a623501
Fix bug in skip of CSV encoding test
brendan-ward Apr 17, 2024
d615fd7
Merge branch 'main' into issue380
brendan-ward Apr 25, 2024
d2b43f3
cleanup, add arrow I/O tests
brendan-ward Apr 25, 2024
815620a
Fix missing test annotation
brendan-ward Apr 26, 2024
6eca265
Don't fail in error handler if message cannot be decoded to UTF-8
brendan-ward Apr 26, 2024
34bacc3
Fix bug in attempted fix
brendan-ward Apr 26, 2024
e088311
Fix failing test for GDAL >= 3.9
brendan-ward Apr 26, 2024
58d1d3f
Fix other failing FlatGeobuff tests
brendan-ward Apr 26, 2024
4 changes: 3 additions & 1 deletion CHANGES.md
@@ -9,7 +9,6 @@
including the CRS, when using GDAL 3.8 or higher (#366).
- Warn when reading from a multilayer file without specifying a layer (#362).


### Bug fixes

- Fix error in `write_dataframe` if input has a date column and
@@ -19,6 +18,9 @@
- Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if
a boolean column is detected due to error in GDAL reading boolean values (#335)
this has been fixed in GDAL >= 3.8.3.
- Properly handle decoding of ESRI Shapefiles with user-provided `encoding`
option for `read`, `read_dataframe`, and `open_arrow`, and correctly encode
Shapefile field names and text values to the user-provided `encoding` (#384).
Member: What does this last sentence mean exactly? Is that for writing?

Member Author: Yes, that was for writing. Will make more explicit.


### Packaging

18 changes: 11 additions & 7 deletions docs/source/introduction.md
@@ -89,6 +89,10 @@ first layer.
}
```

NOTE: pyogrio will report `UTF-8` if either the native encoding is likely to be
`UTF-8` or GDAL can automatically convert from the detected native encoding to
`UTF-8`.
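What this auto-conversion amounts to can be illustrated without GDAL at all (a pure-Python sketch; the sample text and code page are made up for illustration):

```python
# Text stored on disk in a legacy single-byte code page (here CP1252).
raw = "Café".encode("cp1252")

# GDAL's auto-decoding is conceptually: decode from the detected native
# encoding, then hand the caller UTF-8 text. Pyogrio then reports the
# encoding as UTF-8, not the native CP1252.
decoded = raw.decode("cp1252")
utf8_bytes = decoded.encode("utf-8")

print(decoded)      # Café
print(utf8_bytes)   # b'Caf\xc3\xa9'
```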

To read from a layer using name or index (the following are equivalent):

```python
@@ -468,21 +472,21 @@ You can also read from a URL with this syntax:

GDAL only supports datetimes at a millisecond resolution. Reading data will thus
give at most millisecond resolution (`datetime64[ms]` data type). With pandas 2.0
`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]`
correspondingly. For previous versions of pandas, `datetime64[ns]` is used as
ms precision was not supported. When writing, only precision up to
ms is retained.

Not all file formats have dedicated support to store datetime data, like ESRI
Shapefile. For such formats, or if you require precision > ms, a workaround is to
convert the datetimes to string.
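A sketch of that workaround using only the standard library (the timestamp is made up; in practice you would apply this to the relevant dataframe column before writing):

```python
from datetime import datetime

# A timestamp with microsecond precision, which would be truncated to
# millisecond precision if written as a datetime field.
ts = datetime(2024, 4, 26, 12, 30, 15, 123456)

# Store it as a string field instead; the full precision survives.
as_text = ts.isoformat()
print(as_text)  # 2024-04-26T12:30:15.123456

# Parse it back after reading.
restored = datetime.fromisoformat(as_text)
```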

Timezone information is preserved where possible, however GDAL only represents
time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or
`zoneinfo`). This means that dataframes with columns containing multiple offsets
(e.g. when switching from standard time to summer time) will be written correctly,
but when read via `pyogrio.read_dataframe()` will be returned as a UTC datetime
column, as there is no way to reconstruct the original timezone from the individual
offsets present.
offsets present.
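The effect can be reproduced with the standard library alone: two timestamps recorded with different UTC offsets (standard vs. summer time, here picked arbitrarily as UTC+1 and UTC+2) normalize to plain UTC, after which nothing identifies the IANA zone that produced them:

```python
from datetime import datetime, timedelta, timezone

# Winter timestamp at UTC+1, summer timestamp at UTC+2 (e.g. Central
# European Time and its daylight-saving counterpart).
winter = datetime(2024, 1, 15, 12, 0, tzinfo=timezone(timedelta(hours=1)))
summer = datetime(2024, 7, 15, 12, 0, tzinfo=timezone(timedelta(hours=2)))

# Converting to UTC keeps the instants but discards which zone produced
# the offsets -- all that remains is a fixed +00:00 offset.
winter_utc = winter.astimezone(timezone.utc)
summer_utc = summer.astimezone(timezone.utc)
print(winter_utc.isoformat())  # 2024-01-15T11:00:00+00:00
print(summer_utc.isoformat())  # 2024-07-15T10:00:00+00:00
```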

## Dataset and layer creation options
16 changes: 12 additions & 4 deletions docs/source/known_issues.md
@@ -41,10 +41,18 @@ geometries from the data layer.

## Character encoding

Pyogrio supports reading / writing data layers with a defined encoding. However,
DataFrames do not currently allow arbitrary metadata, which means that we are
currently unable to store encoding information for a data source. Text fields
are read into Python UTF-8 strings.
Pyogrio supports reading / writing data layers with a defined encoding. Where
possible and the `encoding` option is not specified, GDAL will attempt to
automatically decode from the native encoding to `UTF-8`, and pyogrio will report
that the encoding is `UTF-8` in that case instead of the native encoding. For
[ESRI Shapefiles](https://gdal.org/drivers/vector/shapefile.html#encoding),
GDAL will use the associated `.cpg` file or a code page specified in the `.dbf`
file to infer the native encoding, but may incorrectly assume the native encoding
is `ISO-8859-1`, leading to miscoding errors. Most other drivers are assumed to
be in `UTF-8`, but it is possible (in theory) to specify the `encoding` parameter
manually to force conversions to use the specified encoding value.

Field names and values are read into Python `UTF-8` strings.
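The miscoding risk mentioned above is easy to reproduce without GDAL: CP1252 and ISO-8859-1 agree on most bytes but differ in the 0x80–0x9F range, so a CP1252 file read under an assumed ISO-8859-1 encoding silently yields wrong characters (a pure-Python sketch):

```python
# The euro sign is byte 0x80 in CP1252 but an unprintable control
# character in ISO-8859-1.
raw = "€50".encode("cp1252")

correct = raw.decode("cp1252")       # what an explicit encoding gives you
miscoded = raw.decode("iso-8859-1")  # what the wrong assumption gives you

print(correct)         # €50
print(repr(miscoded))  # '\x8050'
```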

## No validation of geometry or field types

168 changes: 156 additions & 12 deletions pyogrio/_io.pyx
@@ -486,6 +486,9 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
# read without recoding. Hence, it is up to you to supply the data in the
# appropriate encoding. More info:
# https://gdal.org/development/rfc/rfc23_ogr_unicode.html#oftstring-oftstringlist-fields
    # NOTE: this always returns False for the layer returned by executing SQL,
    # even when they indeed support UTF-8; the layer underlying the SQL must
    # be passed instead
return "UTF-8"

driver = get_driver(ogr_dataset)
@@ -1109,8 +1112,12 @@ def ogr_read(
cdef char **fields_c = NULL
cdef OGRDataSourceH ogr_dataset = NULL
cdef OGRLayerH ogr_layer = NULL
cdef OGRLayerH base_layer = NULL
cdef int feature_count = 0
cdef double xmin, ymin, xmax, ymax
cdef const char* prev_shape_encoding_c = NULL
cdef char* prev_shape_encoding = NULL
cdef bint override_shape_encoding = False

path_b = path.encode('utf-8')
path_c = path_b
@@ -1142,9 +1149,35 @@
raise ValueError("'max_features' must be >= 0")

try:
if encoding:
override_shape_encoding = True

# for shapefiles, SHAPE_ENCODING must be set before opening the file
# to prevent automatic decoding to UTF-8 by GDAL, so we save previous
# SHAPE_ENCODING so that it can be restored later
# (we do this for all data sources where encoding is set because
# we don't know the driver until after it is opened, which is too late)
prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL)
if prev_shape_encoding_c != NULL:
# strings returned from config options may be replaced via
# CPLSetConfigOption() below; GDAL instructs us to save a copy
# in a new string
prev_shape_encoding = CPLStrdup(prev_shape_encoding_c)

# set encoding used automatically by GDAL
encoding_b = encoding.encode('UTF-8')
encoding_c = encoding_b
CPLSetThreadLocalConfigOption("SHAPE_ENCODING", <const char*>encoding_c)

dataset_options = dict_to_options(dataset_kwargs)
ogr_dataset = ogr_open(path_c, 0, dataset_options)

if encoding and get_driver(ogr_dataset) == "ESRI Shapefile":
# Because SHAPE_ENCODING is set above, GDAL will automatically decode
# to UTF-8
encoding = "UTF-8"

if sql is None:
if layer is None:
layer = get_default_layer(ogr_dataset)
@@ -1156,7 +1189,22 @@

# Encoding is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
if encoding:
if get_driver(ogr_dataset) == "ESRI Shapefile":
# Because SHAPE_ENCODING is set above, GDAL will automatically decode
# to UTF-8; ignore any encoding set by user
encoding = "UTF-8"

else:
if sql is not None and get_driver(ogr_dataset) == "ESRI Shapefile":
# in order to properly test certain capabilities, we need to have the
# underlying layer referenced by the SQL query
base_layer = GDALDatasetGetLayer(ogr_dataset, 0)

else:
base_layer = ogr_layer

encoding = detect_encoding(ogr_dataset, base_layer)

fields = get_fields(ogr_layer, encoding)

@@ -1249,6 +1297,13 @@
GDALClose(ogr_dataset)
ogr_dataset = NULL

# reset SHAPE_ENCODING config parameter if temporarily set above
if override_shape_encoding:
CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding)

if prev_shape_encoding != NULL:
CPLFree(prev_shape_encoding)

return (
meta,
fid_data,
Expand Down Expand Up @@ -1282,9 +1337,13 @@ def ogr_open_arrow(
cdef const char *where_c = NULL
cdef OGRDataSourceH ogr_dataset = NULL
cdef OGRLayerH ogr_layer = NULL
cdef OGRLayerH base_layer = NULL
cdef char **fields_c = NULL
cdef const char *field_c = NULL
cdef char **options = NULL
cdef const char* prev_shape_encoding_c = NULL
cdef char* prev_shape_encoding = NULL
cdef bint override_shape_encoding = False
cdef ArrowArrayStream stream
cdef ArrowSchema schema

@@ -1328,6 +1387,26 @@

reader = None
try:
if encoding:
override_shape_encoding = True

# for shapefiles, SHAPE_ENCODING must be set before opening the file
# to prevent automatic decoding to UTF-8 by GDAL, so we save previous
# SHAPE_ENCODING so that it can be restored later
# (we do this for all data sources where encoding is set because
# we don't know the driver until after it is opened, which is too late)
prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL)
if prev_shape_encoding_c != NULL:
# strings returned from config options may be replaced via
# CPLSetConfigOption() below; GDAL instructs us to save a copy
# in a new string
prev_shape_encoding = CPLStrdup(prev_shape_encoding_c)

# set encoding used automatically by GDAL
encoding_b = encoding.encode('UTF-8')
encoding_c = encoding_b
CPLSetThreadLocalConfigOption("SHAPE_ENCODING", <const char*>encoding_c)

dataset_options = dict_to_options(dataset_kwargs)
ogr_dataset = ogr_open(path_c, 0, dataset_options)

@@ -1342,7 +1421,22 @@

# Encoding is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
if encoding:
if get_driver(ogr_dataset) == "ESRI Shapefile":
# Because SHAPE_ENCODING is set above, GDAL will automatically decode
# to UTF-8; ignore any encoding set by user
encoding = "UTF-8"

else:
if sql is not None and get_driver(ogr_dataset) == "ESRI Shapefile":
# in order to properly test certain capabilities, we need to have the
# underlying layer referenced by the SQL query
base_layer = GDALDatasetGetLayer(ogr_dataset, 0)

else:
base_layer = ogr_layer

encoding = detect_encoding(ogr_dataset, base_layer)

fields = get_fields(ogr_layer, encoding, use_arrow=True)

@@ -1450,6 +1544,13 @@
GDALClose(ogr_dataset)
ogr_dataset = NULL

# reset SHAPE_ENCODING config parameter if temporarily set above
if override_shape_encoding:
CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding)

if prev_shape_encoding != NULL:
CPLFree(<void*>prev_shape_encoding)

def ogr_read_bounds(
str path,
object layer=None,
@@ -1518,12 +1619,34 @@ def ogr_read_info(
cdef char **dataset_options = NULL
cdef OGRDataSourceH ogr_dataset = NULL
cdef OGRLayerH ogr_layer = NULL
cdef const char* prev_shape_encoding = NULL
cdef bint override_shape_encoding = False

path_b = path.encode('utf-8')
path_c = path_b


try:
if encoding:
override_shape_encoding = True

# for shapefiles, SHAPE_ENCODING must be set before opening the file
# to prevent automatic decoding to UTF-8 by GDAL, so we save previous
# SHAPE_ENCODING so that it can be restored later
# (we do this for all data sources where encoding is set because
# we don't know the driver until after it is opened, which is too late)
prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL)
if prev_shape_encoding_c != NULL:
# strings returned from config options may be replaced via
# CPLSetConfigOption() below; GDAL instructs us to save a copy
# in a new string
prev_shape_encoding = CPLStrdup(prev_shape_encoding_c)

# set encoding used automatically by GDAL
encoding_b = encoding.encode('UTF-8')
encoding_c = encoding_b
CPLSetThreadLocalConfigOption("SHAPE_ENCODING", <const char*>encoding_c)


dataset_options = dict_to_options(dataset_kwargs)
ogr_dataset = ogr_open(path_c, 0, dataset_options)

@@ -1533,7 +1656,13 @@

# Encoding is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
if encoding and get_driver(ogr_dataset) == "ESRI Shapefile":
# Because SHAPE_ENCODING is set above, GDAL will automatically decode
# to UTF-8
encoding = "UTF-8"

else:
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

fields = get_fields(ogr_layer, encoding)

@@ -1566,6 +1695,13 @@
GDALClose(ogr_dataset)
ogr_dataset = NULL

# reset SHAPE_ENCODING config parameter if temporarily set above
if override_shape_encoding:
CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding)

if prev_shape_encoding != NULL:
CPLFree(<void*>prev_shape_encoding)

return meta


@@ -1600,7 +1736,7 @@ cdef str get_default_layer(OGRDataSourceH ogr_dataset):
-------
str
the name of the default layer to be read.

"""
layers = get_layer_names(ogr_dataset)
first_layer_name = layers[0][0]
@@ -1632,7 +1768,7 @@ cdef get_layer_names(OGRDataSourceH ogr_dataset):
-------
ndarray(n)
array of layer names

"""
cdef OGRLayerH ogr_layer = NULL

@@ -1862,8 +1998,9 @@ cdef create_ogr_dataset_layer(
# Setup layer creation options

if driver == 'ESRI Shapefile':
# Fiona only sets encoding for shapefiles; other drivers do not support
# encoding as an option.
# ENCODING option must be set for shapefiles to properly write *.cpg
# file containing the encoding; this is not a supported option for
# other drivers
if encoding is None:
encoding = "UTF-8"
encoding_b = encoding.upper().encode('UTF-8')
@@ -1988,10 +2125,17 @@ def ogr_write(
&ogr_dataset, &ogr_layer,
)

# Now the dataset and layer have been created, we can properly determine the
# encoding. It is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

if driver == 'ESRI Shapefile':
# force encoding for remaining operations to be in UTF-8 because
# GDAL will automatically convert those to the target encoding
encoding = "UTF-8"

else:
# Now the dataset and layer have been created, we can properly determine the
# encoding. It is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

### Create the fields
field_types = None
3 changes: 3 additions & 0 deletions pyogrio/_ogr.pxd
@@ -12,6 +12,9 @@ cdef extern from "cpl_conv.h":
const char* CPLFindFile(const char *pszClass, const char *filename)
const char* CPLGetConfigOption(const char* key, const char* value)
void CPLSetConfigOption(const char* key, const char* value)
const char* CPLGetThreadLocalConfigOption(const char* key, const char* value)
void CPLSetThreadLocalConfigOption(const char* key, const char* value)
char* CPLStrdup(const char* string)


cdef extern from "cpl_error.h" nogil:
4 changes: 4 additions & 0 deletions pyogrio/core.py
@@ -217,6 +217,10 @@ def read_info(
driver or if the data source is nonspatial. You can force it to be calculated using
the ``force_total_bounds`` parameter.

``encoding`` will be ``UTF-8`` if either the native encoding is likely to be
``UTF-8`` or GDAL can automatically convert from the detected native encoding
to ``UTF-8``.

Parameters
----------
path : str or pathlib.Path