geopandas · jorisvandenbossche · Apr 26, 2024 · Apr 5, 2024 · Apr 5, 2024 · Apr 6, 2024
diff --git a/CHANGES.md b/CHANGES.md
@@ -9,7 +9,6 @@
     including the CRS, when using GDAL 3.8 or higher (#366).
 -   Warn when reading from a multilayer file without specifying a layer (#362).
 
-
 ### Bug fixes
 
 -   Fix error in `write_dataframe` if input has a date column and
@@ -19,6 +18,9 @@
 -   Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if
     a boolean column is detected due to error in GDAL reading boolean values (#335)
     this has been fixed in GDAL >= 3.8.3.
+-   Properly handle decoding of ESRI Shapefiles with user-provided `encoding`
+    option for `read`, `read_dataframe`, and `open_arrow`, and correctly encode
+    Shapefile field names and text values to the user-provided `encoding` (#384).
 
 ### Packaging
 

diff --git a/docs/source/introduction.md b/docs/source/introduction.md
@@ -89,6 +89,10 @@ first layer.
 }
 ```
 
+NOTE: pyogrio will report `UTF-8` if either the native encoding is likely to be
+`UTF-8` or GDAL can automatically convert from the detected native encoding to
+`UTF-8`.
+
 To read from a layer using name or index (the following are equivalent):
 
 ```python
@@ -468,21 +472,21 @@ You can also read from a URL with this syntax:
 
 GDAL only supports datetimes at a millisecond resolution. Reading data will thus
 give at most millisecond resolution (`datetime64[ms]` data type). With pandas 2.0
-`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]` 
-correspondingly. For previous versions of pandas, `datetime64[ns]` is used as 
-ms precision was not supported. When writing, only precision up to 
+`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]`
+correspondingly. For previous versions of pandas, `datetime64[ns]` is used as
+ms precision was not supported. When writing, only precision up to
 ms is retained.
 
 Not all file formats have dedicated support to store datetime data, like ESRI
 Shapefile. For such formats, or if you require precision > ms, a workaround is to
 convert the datetimes to string.
 
 Timezone information is preserved where possible, however GDAL only represents
-time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or 
-`zoneinfo`). This means that dataframes with columns containing multiple offsets 
+time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or
+`zoneinfo`). This means that dataframes with columns containing multiple offsets
 (e.g. when switching from standard time to summer time) will be written correctly,
-but when read via `pyogrio.read_dataframe()` will be returned as a UTC datetime 
-column, as there is no way to reconstruct the original timezone from the individual 
+but when read via `pyogrio.read_dataframe()` will be returned as a UTC datetime
+column, as there is no way to reconstruct the original timezone from the individual
 offsets present.
 
 ## Dataset and layer creation options

diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md
@@ -41,10 +41,18 @@ geometries from the data layer.
 
 ## Character encoding
 
-Pyogrio supports reading / writing data layers with a defined encoding. However,
-DataFrames do not currently allow arbitrary metadata, which means that we are
-currently unable to store encoding information for a data source. Text fields
-are read into Python UTF-8 strings.
+Pyogrio supports reading / writing data layers with a defined encoding. Where
+possible and the `encoding` option is not specified, GDAL will attempt to
+automatically decode from the native encoding to `UTF-8`, and pyogrio will report
+that the encoding is `UTF-8` in that case instead of the native encoding. For
+[ESRI Shapefiles](https://gdal.org/drivers/vector/shapefile.html#encoding),
+GDAL will use the associated `.cpg` file or a code page specified in the `.dbf`
+file to infer the native encoding, but may incorrectly assume the native encoding
+is `ISO-8859-1`, leading to miscoding errors. Most other drivers are assumed to
+be in `UTF-8`, but it is possible (in theory) to specify the `encoding` parameter
+manually to force conversions to use the specified encoding value.
+
+Field names and values are read into Python `UTF-8` strings.
 
 ## No validation of geometry or field types
 

diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
@@ -134,6 +134,34 @@ cdef char** dict_to_options(object values):
     return options
 
 
+cdef const char* override_threadlocal_config_option(const char* key, const char* value):
+    """Set the CPLSetThreadLocalConfigOption for key=value
+
+    Parameters
+    ----------
+    key : const char*
+        python string must decoded to UTF-8 before calling this
+    value : const char*
+        python string must decoded to UTF-8 before calling this
+
+    Returns
+    -------
+    const char*
+        value previously set for key, so that it can be later restored.  Caller
+        is responsible for freeing this via CPLFree() if not NULL.
+    """
+    cdef const char *prev_value = CPLGetThreadLocalConfigOption(key, NULL)
+    if prev_value != NULL:
+        # strings returned from config options may be replaced via
+        # CPLSetConfigOption() below; GDAL instructs us to save a copy
+        # in a new string
+        prev_value = CPLStrdup(prev_value)
+
+    CPLSetThreadLocalConfigOption(key, value)
+
+    return prev_value
+
+
 cdef void* ogr_open(const char* path_c, int mode, char** options) except NULL:
     cdef void* ogr_dataset = NULL
 
@@ -486,13 +514,27 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
         # read without recoding. Hence, it is up to you to supply the data in the
         # appropriate encoding. More info:
         # https://gdal.org/development/rfc/rfc23_ogr_unicode.html#oftstring-oftstringlist-fields
+        # NOTE: for shapefiles, this always returns False for the layer returned
+        # when executing SQL, even when it supports UTF-8 (patched below);
+        # this may be fixed by https://github.com/OSGeo/gdal/pull/9649 (GDAL >=3.9.0?)
         return "UTF-8"
 
     driver = get_driver(ogr_dataset)
     if driver == "ESRI Shapefile":
-        # Typically, OGR_L_TestCapability returns True for OLCStringsAsUTF8 for ESRI
-        # Shapefile so this won't be reached. However, for old versions of GDAL and/or
-        # if libraries are missing, this fallback could be needed.
+        # OGR_L_TestCapability returns True for OLCStringsAsUTF8 (above) for
+        # shapefiles when a .cpg file is present with a valid encoding, or GDAL
+        # auto-detects the encoding from the code page of the .dbf file, or
+        # SHAPE_ENCODING config option is set, or ENCODING layer creation option
+        # is specified (shapefiles only).  Otherwise, we can only assume that
+        # shapefiles are in their default encoding of ISO-8859-1 (which may be
+        # incorrect and must be overridden by user-provided encoding)
+
+        # Always use the first layer to test capabilities until detection for
+        # SQL results from shapefiles are fixed (above)
+        # This block should only be used for unfixed versions of GDAL (<3.9.0?)
+        if OGR_L_TestCapability(GDALDatasetGetLayer(ogr_dataset, 0), OLCStringsAsUTF8):
+            return "UTF-8"
+
         return "ISO-8859-1"
 
     if driver == "OSM":
@@ -1111,6 +1153,8 @@ def ogr_read(
     cdef OGRLayerH ogr_layer = NULL
     cdef int feature_count = 0
     cdef double xmin, ymin, xmax, ymax
+    cdef const char *prev_shape_encoding = NULL
+    cdef bint override_shape_encoding = False
 
     path_b = path.encode('utf-8')
     path_c = path_b
@@ -1142,6 +1186,18 @@ def ogr_read(
         raise ValueError("'max_features' must be >= 0")
 
     try:
+        if encoding:
+            override_shape_encoding = True
+
+            # for shapefiles, SHAPE_ENCODING must be set before opening the file
+            # to prevent automatic decoding to UTF-8 by GDAL, so we save previous
+            # SHAPE_ENCODING so that it can be restored later
+            # (we do this for all data sources where encoding is set because
+            # we don't know the driver until after it is opened, which is too late)
+            encoding_b = encoding.encode("UTF-8")
+            encoding_c = encoding_b
+            prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c)
+
         dataset_options = dict_to_options(dataset_kwargs)
         ogr_dataset = ogr_open(path_c, 0, dataset_options)
 
@@ -1156,7 +1212,14 @@ def ogr_read(
 
         # Encoding is derived from the user, from the dataset capabilities / type,
         # or from the system locale
-        encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
+        if encoding:
+            if get_driver(ogr_dataset) == "ESRI Shapefile":
+                # Because SHAPE_ENCODING is set above, GDAL will automatically
+                # decode shapefiles to UTF-8; ignore any encoding set by user
+                encoding = "UTF-8"
+
+        else:
+            encoding = detect_encoding(ogr_dataset, ogr_layer)
 
         fields = get_fields(ogr_layer, encoding)
 
@@ -1249,6 +1312,13 @@ def ogr_read(
             GDALClose(ogr_dataset)
             ogr_dataset = NULL
 
+        # reset SHAPE_ENCODING config parameter if temporarily set above
+        if override_shape_encoding:
+            CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding)
+
+            if prev_shape_encoding != NULL:
+                CPLFree(<void*>prev_shape_encoding)
+
     return (
         meta,
         fid_data,
@@ -1285,6 +1355,8 @@ def ogr_open_arrow(
     cdef char **fields_c = NULL
     cdef const char *field_c = NULL
     cdef char **options = NULL
+    cdef const char *prev_shape_encoding = NULL
+    cdef bint override_shape_encoding = False
     cdef ArrowArrayStream stream
     cdef ArrowSchema schema
 
@@ -1328,6 +1400,18 @@ def ogr_open_arrow(
 
     reader = None
     try:
+        if encoding:
+            override_shape_encoding = True
+
+            # for shapefiles, SHAPE_ENCODING must be set before opening the file
+            # to prevent automatic decoding to UTF-8 by GDAL, so we save previous
+            # SHAPE_ENCODING so that it can be restored later
+            # (we do this for all data sources where encoding is set because
+            # we don't know the driver until after it is opened, which is too late)
+            encoding_b = encoding.encode("UTF-8")
+            encoding_c = encoding_b
+            prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c)
+
         dataset_options = dict_to_options(dataset_kwargs)
         ogr_dataset = ogr_open(path_c, 0, dataset_options)
 
@@ -1342,7 +1426,14 @@ def ogr_open_arrow(
 
         # Encoding is derived from the user, from the dataset capabilities / type,
         # or from the system locale
-        encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
+        if encoding:
+            if get_driver(ogr_dataset) == "ESRI Shapefile":
+                # Because SHAPE_ENCODING is set above, GDAL will automatically
+                # decode shapefiles to UTF-8; ignore any encoding set by user
+                encoding = "UTF-8"
+
+        else:
+            encoding = detect_encoding(ogr_dataset, ogr_layer)
 
         fields = get_fields(ogr_layer, encoding, use_arrow=True)
 
@@ -1450,6 +1541,13 @@ def ogr_open_arrow(
             GDALClose(ogr_dataset)
             ogr_dataset = NULL
 
+        # reset SHAPE_ENCODING config parameter if temporarily set above
+        if override_shape_encoding:
+            CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding)
+
+            if prev_shape_encoding != NULL:
+                CPLFree(<void*>prev_shape_encoding)
+
 def ogr_read_bounds(
     str path,
     object layer=None,
@@ -1518,12 +1616,26 @@ def ogr_read_info(
     cdef char **dataset_options = NULL
     cdef OGRDataSourceH ogr_dataset = NULL
     cdef OGRLayerH ogr_layer = NULL
+    cdef const char *prev_shape_encoding = NULL
+    cdef bint override_shape_encoding = False
 
     path_b = path.encode('utf-8')
     path_c = path_b
 
 
     try:
+        if encoding:
+            override_shape_encoding = True
+
+            # for shapefiles, SHAPE_ENCODING must be set before opening the file
+            # to prevent automatic decoding to UTF-8 by GDAL, so we save previous
+            # SHAPE_ENCODING so that it can be restored later
+            # (we do this for all data sources where encoding is set because
+            # we don't know the driver until after it is opened, which is too late)
+            encoding_b = encoding.encode("UTF-8")
+            encoding_c = encoding_b
+            prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c)
+
         dataset_options = dict_to_options(dataset_kwargs)
         ogr_dataset = ogr_open(path_c, 0, dataset_options)
 
@@ -1533,7 +1645,13 @@ def ogr_read_info(
 
         # Encoding is derived from the user, from the dataset capabilities / type,
         # or from the system locale
-        encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
+        if encoding and get_driver(ogr_dataset) == "ESRI Shapefile":
+            # Because SHAPE_ENCODING is set above, GDAL will automatically
+            # decode shapefiles to UTF-8; ignore any encoding set by user
+            encoding = "UTF-8"
+
+        else:
+            encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
 
         fields = get_fields(ogr_layer, encoding)
 
@@ -1566,6 +1684,13 @@ def ogr_read_info(
             GDALClose(ogr_dataset)
             ogr_dataset = NULL
 
+        # reset SHAPE_ENCODING config parameter if temporarily set above
+        if override_shape_encoding:
+            CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding)
+
+            if prev_shape_encoding != NULL:
+                CPLFree(<void*>prev_shape_encoding)
+
     return meta
 
 
@@ -1862,8 +1987,9 @@ cdef create_ogr_dataset_layer(
         # Setup layer creation options
 
         if driver == 'ESRI Shapefile':
-            # Fiona only sets encoding for shapefiles; other drivers do not support
-            # encoding as an option.
+            # ENCODING option must be set for shapefiles to properly write *.cpg
+            # file containing the encoding; this is not a supported option for
+            # other drivers
             if encoding is None:
                 encoding = "UTF-8"
             encoding_b = encoding.upper().encode('UTF-8')
@@ -1988,10 +2114,18 @@ def ogr_write(
         &ogr_dataset, &ogr_layer,
     )
 
-    # Now the dataset and layer have been created, we can properly determine the
-    # encoding. It is derived from the user, from the dataset capabilities / type,
-    # or from the system locale
-    encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
+
+    if driver == 'ESRI Shapefile':
+        # force encoding for remaining operations to be in UTF-8 (even if user
+        # provides an encoding) because GDAL will automatically convert those to
+        # the target encoding because ENCODING is set as a layer creation option
+        encoding = "UTF-8"
+
+    else:
+        # Now the dataset and layer have been created, we can properly determine the
+        # encoding. It is derived from the user, from the dataset capabilities / type,
+        # or from the system locale
+        encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
 
     ### Create the fields
     field_types = None

diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd
@@ -12,6 +12,9 @@ cdef extern from "cpl_conv.h":
     const char* CPLFindFile(const char *pszClass, const char *filename)
     const char* CPLGetConfigOption(const char* key, const char* value)
     void        CPLSetConfigOption(const char* key, const char* value)
+    const char* CPLGetThreadLocalConfigOption(const char* key, const char* value)
+    void        CPLSetThreadLocalConfigOption(const char* key, const char* value)
+    char*       CPLStrdup(const char* string)
 
 
 cdef extern from "cpl_error.h" nogil:

diff --git a/pyogrio/core.py b/pyogrio/core.py
@@ -217,6 +217,10 @@ def read_info(
     driver or if the data source is nonspatial. You can force it to be calculated using
     the ``force_total_bounds`` parameter.
 
+    ``encoding`` will be ``UTF-8`` if either the native encoding is likely to be
+    ``UTF-8`` or GDAL can automatically convert from the detected native encoding
+    to ``UTF-8``.
+
     Parameters
     ----------
     path : str or pathlib.Path

diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py
@@ -106,8 +106,8 @@ def read_dataframe(
         of the layer in the data source.  Defaults to first layer in data source.
     encoding : str, optional (default: None)
         If present, will be used as the encoding for reading string values from
-        the data source, unless encoding can be inferred directly from the data
-        source.
+        the data source.  By default will automatically try to detect the native
+        encoding and decode to ``UTF-8``.
     columns : list-like, optional (default: all columns)
         List of column names to import from the data source.  Column names must
         exactly match the names in the data source, and will be returned in
@@ -349,7 +349,8 @@ def write_dataframe(
         attempts to infer driver from path.
     encoding : str, optional (default: None)
         If present, will be used as the encoding for writing string values to
-        the file.
+        the file.  Use with caution, only certain drivers support encodings
+        other than UTF-8.
     geometry_type : string, optional (default: None)
         By default, the geometry type of the layer will be inferred from the
         data, after applying the promote_to_multi logic. If the data only contains a