From 2bfc862cf5a98642d9ae97748fbc98ce5d8e6f0d Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 4 Apr 2024 19:37:55 -0700 Subject: [PATCH 01/21] Use SHAPE_ENCODING to disable auto-decoding of shapefiles --- pyogrio/_io.pyx | 92 ++++++++++++++++++++++++++++++++++++++++++++++-- pyogrio/_ogr.pxd | 1 + 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 733064af..d4aa7b8a 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1111,6 +1111,9 @@ def ogr_read( cdef OGRLayerH ogr_layer = NULL cdef int feature_count = 0 cdef double xmin, ymin, xmax, ymax + cdef const char* prev_shape_encoding_c = NULL + cdef char* prev_shape_encoding = NULL + cdef bint override_shape_encoding = False path_b = path.encode('utf-8') path_c = path_b @@ -1142,6 +1145,25 @@ def ogr_read( raise ValueError("'max_features' must be >= 0") try: + if encoding: + override_shape_encoding = True + + # for shapefiles, SHAPE_ENCODING must be set before opening the file + # to prevent automatic decoding to UTF-8 by GDAL, so we save previous + # SHAPE_ENCODING so that it can be restored later + # (we do this for all data sources where encoding is set because + # we don't know the driver until after it is opened, which is too late) + prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + if prev_shape_encoding_c != NULL: + # strings returned from config options may be replaced via + # CPLSetConfigOption() below; GDAL instructs us to save a copy + # in a new string + prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) + + # set empty string to disable auto-decoding by GDAL to UTF-8 + CPLSetConfigOption("SHAPE_ENCODING", "") + + dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1156,6 +1178,7 @@ def ogr_read( # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale + # FIXME: it appears we need to set ENCODING sooner encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding) @@ -1249,6 +1272,13 @@ def ogr_read( GDALClose(ogr_dataset) ogr_dataset = NULL + # reset SHAPE_ENCODING config parameter if temporarily set above + if override_shape_encoding: + CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + + if prev_shape_encoding != NULL: + CPLFree(prev_shape_encoding) + return ( meta, fid_data, @@ -1285,6 +1315,9 @@ def ogr_open_arrow( cdef char **fields_c = NULL cdef const char *field_c = NULL cdef char **options = NULL + cdef const char* prev_shape_encoding_c = NULL + cdef char* prev_shape_encoding = NULL + cdef bint override_shape_encoding = False cdef ArrowArrayStream stream cdef ArrowSchema schema @@ -1328,6 +1361,25 @@ def ogr_open_arrow( reader = None try: + if encoding: + override_shape_encoding = True + + # for shapefiles, SHAPE_ENCODING must be set before opening the file + # to prevent automatic decoding to UTF-8 by GDAL, so we save previous + # SHAPE_ENCODING so that it can be restored later + # (we do this for all data sources where encoding is set because + # we don't know the driver until after it is opened, which is too late) + prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + if prev_shape_encoding_c != NULL: + # strings returned from config options may be replaced via + # CPLSetConfigOption() below; GDAL instructs us to save a copy + # in a new string + prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) + + # set empty string to disable auto-decoding by GDAL to UTF-8 + 
CPLSetConfigOption("SHAPE_ENCODING", "") + + dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1450,6 +1502,13 @@ def ogr_open_arrow( GDALClose(ogr_dataset) ogr_dataset = NULL + # reset SHAPE_ENCODING config parameter if temporarily set above + if override_shape_encoding: + CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + + if prev_shape_encoding != NULL: + CPLFree(prev_shape_encoding) + def ogr_read_bounds( str path, object layer=None, @@ -1518,12 +1577,32 @@ def ogr_read_info( cdef char **dataset_options = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL + cdef const char* prev_shape_encoding = NULL + cdef bint override_shape_encoding = False path_b = path.encode('utf-8') path_c = path_b - try: + if encoding: + override_shape_encoding = True + + # for shapefiles, SHAPE_ENCODING must be set before opening the file + # to prevent automatic decoding to UTF-8 by GDAL, so we save previous + # SHAPE_ENCODING so that it can be restored later + # (we do this for all data sources where encoding is set because + # we don't know the driver until after it is opened, which is too late) + prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + if prev_shape_encoding_c != NULL: + # strings returned from config options may be replaced via + # CPLSetConfigOption() below; GDAL instructs us to save a copy + # in a new string + prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) + + # set empty string to disable auto-decoding by GDAL to UTF-8 + CPLSetConfigOption("SHAPE_ENCODING", "") + + dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1566,6 +1645,13 @@ def ogr_read_info( GDALClose(ogr_dataset) ogr_dataset = NULL + # reset SHAPE_ENCODING config parameter if temporarily set above + if override_shape_encoding: + CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + + if prev_shape_encoding != NULL: + CPLFree(prev_shape_encoding) + return meta @@ -1600,7 +1686,7 @@ cdef str get_default_layer(OGRDataSourceH ogr_dataset): ------- str the name of the default layer to be read. - + """ layers = get_layer_names(ogr_dataset) first_layer_name = layers[0][0] @@ -1632,7 +1718,7 @@ cdef get_layer_names(OGRDataSourceH ogr_dataset): ------- ndarray(n) array of layer names - + """ cdef OGRLayerH ogr_layer = NULL diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 35fbd29a..e62241dc 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -12,6 +12,7 @@ cdef extern from "cpl_conv.h": const char* CPLFindFile(const char *pszClass, const char *filename) const char* CPLGetConfigOption(const char* key, const char* value) void CPLSetConfigOption(const char* key, const char* value) + char* CPLStrdup(const char* string) cdef extern from "cpl_error.h" nogil: From 2b38f9466d92fa2f33c9defc102f9872a9d05afb Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Thu, 4 Apr 2024 21:18:03 -0700 Subject: [PATCH 02/21] Correctly handle I/O for non-UTF-8 shapefiles --- CHANGES.md | 4 +++- pyogrio/_io.pyx | 20 ++++++++++++------ pyogrio/tests/test_raw_io.py | 39 ++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 7086b1e9..5c944663 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,7 +9,6 @@ including the CRS, when using GDAL 3.8 or higher (#366). - Warn when reading from a multilayer file without specifying a layer (#362). 
- ### Bug fixes - Fix error in `write_dataframe` if input has a date column and @@ -19,6 +18,9 @@ - Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if a boolean column is detected due to error in GDAL reading boolean values (#335) this has been fixed in GDAL >= 3.8.3. +- Disable GDAL's auto-decoding of ESRI Shapefiles to `UTF-8` in order to enable + user-provided `encoding` option to work properly for `read`, `read_dataframe`, and `open_arrow`, and correctly encode Shapefile field names and text values to the + user-provided `encoding` (#384). ### Packaging diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index d4aa7b8a..04c9190e 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1948,8 +1948,9 @@ cdef create_ogr_dataset_layer( # Setup layer creation options if driver == 'ESRI Shapefile': - # Fiona only sets encoding for shapefiles; other drivers do not support - # encoding as an option. + # ENCODING option must be set for shapefiles to properly write *.cpg + # file containing the encoding; this is not a supported option for + # other drivers if encoding is None: encoding = "UTF-8" encoding_b = encoding.upper().encode('UTF-8') @@ -2074,10 +2075,17 @@ def ogr_write( &ogr_dataset, &ogr_layer, ) - # Now the dataset and layer have been created, we can properly determine the - # encoding. It is derived from the user, from the dataset capabilities / type, - # or from the system locale - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + + if driver == 'ESRI Shapefile': + # force encoding for remaining operations to be in UTF-8 because + # GDAL will automatically convert those to the target encoding + encoding = "UTF-8" + + else: + # Now the dataset and layer have been created, we can properly determine the + # encoding. 
It is derived from the user, from the dataset capabilities / type, + # or from the system locale + encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) ### Create the fields field_types = None diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 362c5773..8f711338 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1168,6 +1168,45 @@ def test_encoding_io_shapefile(tmp_path, read_encoding, write_encoding): ) +def test_non_utf8_encoding_shapefile(tmp_path): + encoding = "CP936" + + # Point(0, 0) + geometry = np.array( + [bytes.fromhex("010100000000000000000000000000000000000000")], dtype=object + ) + + mandarin = "中文" + field_data = [np.array([mandarin], dtype=object)] + + fields = [mandarin] + meta = dict(geometry_type="Point", crs="EPSG:4326", encoding=encoding) + + filename = tmp_path / "test.shp" + # NOTE: GDAL automatically creates a cpg file with the encoding name, which + # means that if we read this without specifying the encoding it uses the + # correct one + write(filename, geometry, field_data, fields, **meta) + + actual_meta, _, _, actual_field_data = read(filename) + assert np.array_equal(fields, actual_meta["fields"]) + assert np.array_equal(field_data, actual_field_data) + assert np.array_equal(fields, read_info(filename)["fields"]) + + # verify that if cpg file is not present, that user-provided encoding must be used + os.unlink(str(filename).replace(".shp", ".cpg")) + + bad_meta, _, _, bad_field_data = read(filename) + assert not np.array_equal(fields, bad_meta["fields"]) + assert not np.array_equal(field_data, bad_field_data) + assert not np.array_equal(fields, read_info(filename)["fields"]) + + actual_meta, _, _, actual_field_data = read(filename, encoding=encoding) + assert np.array_equal(fields, actual_meta["fields"]) + assert np.array_equal(field_data, actual_field_data) + assert np.array_equal(fields, read_info(filename, encoding=encoding)["fields"]) + + def test_write_with_mask(tmp_path): # Point(0, 0), null geometry = np.array( From e738f7b8aec3ca540c05ece7f022cbe087510e73 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Fri, 5 Apr 2024 18:12:29 -0700 Subject: [PATCH 03/21] Use thread local config, add more tests --- CHANGES.md | 6 +-- pyogrio/_io.pyx | 56 ++++++++++++++++-------- pyogrio/_ogr.pxd | 2 + pyogrio/geopandas.py | 3 +- pyogrio/tests/test_geopandas_io.py | 69 ++++++++++++++++++++++++++++++ pyogrio/tests/test_raw_io.py | 66 ++++++++++++++++++++++------ 6 files changed, 169 insertions(+), 33 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 5c944663..6740bd3b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -18,9 +18,9 @@ - Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if a boolean column is detected due to error in GDAL reading boolean values (#335) this has been fixed in GDAL >= 3.8.3. -- Disable GDAL's auto-decoding of ESRI Shapefiles to `UTF-8` in order to enable - user-provided `encoding` option to work properly for `read`, `read_dataframe`, and `open_arrow`, and correctly encode Shapefile field names and text values to the - user-provided `encoding` (#384). +- Properly handle decoding of ESRI Shapefiles with user-provided `encoding` + option for `read`, `read_dataframe`, and `open_arrow`, and correctly encode + Shapefile field names and text values to the user-provided `encoding` (#384). 
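A usage-level sketch of the round-trip this entry describes, mirroring the tests added in this series (the output path and the CP936 sample text are illustrative only, not part of the patch):

```python
import geopandas as gp
from shapely.geometry import Point

from pyogrio import read_dataframe, write_dataframe

df = gp.GeoDataFrame({"中文": ["中文"], "geometry": [Point(0, 0)]}, crs="EPSG:4326")

# encoding is passed through to the ENCODING layer creation option, so GDAL
# writes a .cpg sidecar file naming the code page
write_dataframe(df, "/tmp/test.shp", encoding="CP936")

# with the .cpg sidecar present, GDAL detects the encoding automatically; if
# it is missing, the user-provided encoding is required to decode correctly
actual = read_dataframe("/tmp/test.shp", encoding="CP936")
assert actual.columns[0] == "中文"
```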
### Packaging diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 04c9190e..852adadd 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1153,15 +1153,17 @@ def ogr_read( # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) if prev_shape_encoding_c != NULL: # strings returned from config options may be replaced via # CPLSetConfigOption() below; GDAL instructs us to save a copy # in a new string prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - # set empty string to disable auto-decoding by GDAL to UTF-8 - CPLSetConfigOption("SHAPE_ENCODING", "") + # set encoding used automatically by GDAL + encoding_b = encoding.encode('UTF-8') + encoding_c = encoding_b + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) @@ -1178,8 +1180,13 @@ def ogr_read( # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - # FIXME: it appears we need to set ENCODING sooner - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8 + encoding = "UTF-8" + + else: + encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding) @@ -1274,7 +1281,7 @@ def ogr_read( # reset SHAPE_ENCODING config parameter if temporarily set above if override_shape_encoding: - CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: CPLFree(prev_shape_encoding) @@ -1369,16 +1376,17 @@ def ogr_open_arrow( # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) if prev_shape_encoding_c != NULL: # strings returned from config options may be replaced via # CPLSetConfigOption() below; GDAL instructs us to save a copy # in a new string prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - # set empty string to disable auto-decoding by GDAL to UTF-8 - CPLSetConfigOption("SHAPE_ENCODING", "") - + # set encoding used automatically by GDAL + encoding_b = encoding.encode('UTF-8') + encoding_c = encoding_b + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1394,7 +1402,13 @@ def ogr_open_arrow( # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8 + encoding = "UTF-8" + + else: + encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding, use_arrow=True) @@ -1504,7 +1518,7 @@ def ogr_open_arrow( # reset SHAPE_ENCODING config parameter if temporarily set above if 
override_shape_encoding: - CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: CPLFree(prev_shape_encoding) @@ -1592,15 +1606,17 @@ def ogr_read_info( # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) if prev_shape_encoding_c != NULL: # strings returned from config options may be replaced via # CPLSetConfigOption() below; GDAL instructs us to save a copy # in a new string prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - # set empty string to disable auto-decoding by GDAL to UTF-8 - CPLSetConfigOption("SHAPE_ENCODING", "") + # set encoding used automatically by GDAL + encoding_b = encoding.encode('UTF-8') + encoding_c = encoding_b + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) @@ -1612,7 +1628,13 @@ def ogr_read_info( # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8 + encoding = "UTF-8" + + else: + encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding) @@ -1647,7 +1669,7 @@ def ogr_read_info( # reset SHAPE_ENCODING config parameter if temporarily set above if override_shape_encoding: - CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: CPLFree(prev_shape_encoding) diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index e62241dc..1fd3772e 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -12,6 +12,8 @@ cdef extern from "cpl_conv.h": const char* CPLFindFile(const char *pszClass, const char *filename) const char* CPLGetConfigOption(const char* key, const char* value) void CPLSetConfigOption(const char* key, const char* value) + const char* CPLGetThreadLocalConfigOption(const char* key, const char* value) + void CPLSetThreadLocalConfigOption(const char* key, const char* value) char* CPLStrdup(const char* string) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 1cbcb113..f647126e 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -349,7 +349,8 @@ def write_dataframe( attempts to infer driver from path. encoding : str, optional (default: None) If present, will be used as the encoding for writing string values to - the file. + the file. Use with caution, only certain drivers support encodings + other than UTF-8. geometry_type : string, optional (default: None) By default, the geometry type of the layer will be inferred from the data, after applying the promote_to_multi logic. 
If the data only contains a diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 412b95b6..11c14f0a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1611,3 +1611,72 @@ def test_arrow_bool_exception(tmpdir): "the Arrow API", ): read_dataframe(filename, use_arrow=True) + + +@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +def test_non_utf8_encoding_io(tmp_path, ext): + """Verify that we write non-UTF data to the data source + + IMPORTANT: this may not be valid for the data source and will likely render + them unusable in other tools, but should successfully roundtrip unless we + disable writing using other encodings. + + NOTE: pyarrow cannot handle non-UTF-8 characters in this way + """ + encoding = "CP936" + + output_path = tmp_path / f"test.{ext}" + + mandarin = "中文" + df = gp.GeoDataFrame( + {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326" + ) + write_dataframe(df, output_path, encoding=encoding) + + # cannot open these files without specifying encoding + with pytest.raises(UnicodeDecodeError): + read_dataframe(output_path) + + # must provide encoding to read these properly + actual = read_dataframe(output_path, encoding=encoding) + assert actual.columns[0] == mandarin + assert actual[mandarin].values[0] == mandarin + + +def test_non_utf8_encoding_io_shapefile(tmp_path, use_arrow): + encoding = "CP936" + + output_path = tmp_path / "test.shp" + + mandarin = "中文" + df = gp.GeoDataFrame( + {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326" + ) + write_dataframe(df, output_path, encoding=encoding) + + # NOTE: GDAL automatically creates a cpg file with the encoding name, which + # means that if we read this without specifying the encoding it uses the + # correct one + actual = read_dataframe(output_path, use_arrow=use_arrow) + assert actual.columns[0] == mandarin + assert actual[mandarin].values[0] == mandarin + + # verify that if cpg file is not present, that user-provided encoding must be used + os.unlink(str(output_path).replace(".shp", ".cpg")) + + # We will assume ISO-8859-1, which is wrong + miscoded = mandarin.encode("CP936").decode("ISO-8859-1") + + if use_arrow: + # pyarrow cannot decode column name with incorrect encoding + with pytest.raises(UnicodeDecodeError): + read_dataframe(output_path, use_arrow=True) + else: + bad = read_dataframe(output_path, use_arrow=False) + assert bad.columns[0] == miscoded + assert bad[miscoded].values[0] == miscoded + + # If encoding is provided, that should yield correct text + actual = read_dataframe(output_path, encoding=encoding, use_arrow=use_arrow) + assert actual.columns[0] == mandarin + assert actual[mandarin].values[0] == mandarin diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 8f711338..6b255462 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1168,7 +1168,46 @@ def test_encoding_io_shapefile(tmp_path, read_encoding, write_encoding): ) -def test_non_utf8_encoding_shapefile(tmp_path): +@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +def test_non_utf8_encoding_io(tmp_path, ext): + """Verify that we write non-UTF data to the data source + + IMPORTANT: this may not be valid for the data source and will likely render + them unusable in other tools, but should successfully roundtrip unless we + disable writing using other encodings. 
+ """ + + encoding = "CP936" + + # Point(0, 0) + geometry = np.array( + [bytes.fromhex("010100000000000000000000000000000000000000")], dtype=object + ) + + mandarin = "中文" + field_data = [np.array([mandarin], dtype=object)] + + fields = [mandarin] + meta = dict(geometry_type="Point", crs="EPSG:4326", encoding=encoding) + + filename = tmp_path / f"test.{ext}" + write(filename, geometry, field_data, fields, **meta) + + # cannot open these files without specifying encoding + with pytest.raises(UnicodeDecodeError): + read(filename) + + with pytest.raises(UnicodeDecodeError): + read_info(filename) + + # must provide encoding to read these properly + actual_meta, _, _, actual_field_data = read(filename, encoding=encoding) + assert actual_meta["fields"][0] == mandarin + assert actual_field_data[0] == mandarin + assert read_info(filename, encoding=encoding)["fields"][0] == mandarin + + +def test_non_utf8_encoding_io_shapefile(tmp_path): encoding = "CP936" # Point(0, 0) @@ -1183,28 +1222,31 @@ def test_non_utf8_encoding_shapefile(tmp_path): meta = dict(geometry_type="Point", crs="EPSG:4326", encoding=encoding) filename = tmp_path / "test.shp" + write(filename, geometry, field_data, fields, **meta) + # NOTE: GDAL automatically creates a cpg file with the encoding name, which # means that if we read this without specifying the encoding it uses the # correct one - write(filename, geometry, field_data, fields, **meta) - actual_meta, _, _, actual_field_data = read(filename) - assert np.array_equal(fields, actual_meta["fields"]) - assert np.array_equal(field_data, actual_field_data) - assert np.array_equal(fields, read_info(filename)["fields"]) + assert actual_meta["fields"][0] == mandarin + assert actual_field_data[0] == mandarin + assert read_info(filename)["fields"][0] == mandarin # verify that if cpg file is not present, that user-provided encoding must be used os.unlink(str(filename).replace(".shp", ".cpg")) + # We will assume ISO-8859-1, which is wrong bad_meta, _, _, bad_field_data = read(filename) - assert not np.array_equal(fields, bad_meta["fields"]) - assert not np.array_equal(field_data, bad_field_data) - assert not np.array_equal(fields, read_info(filename)["fields"]) + miscoded = mandarin.encode("CP936").decode("ISO-8859-1") + assert bad_meta["fields"][0] == miscoded + assert bad_field_data[0] == miscoded + assert read_info(filename)["fields"][0] == miscoded + # If encoding is provided, that should yield correct text actual_meta, _, _, actual_field_data = read(filename, encoding=encoding) - assert np.array_equal(fields, actual_meta["fields"]) - assert np.array_equal(field_data, actual_field_data) - assert np.array_equal(fields, read_info(filename, encoding=encoding)["fields"]) + assert actual_meta["fields"][0] == mandarin + assert actual_field_data[0] == mandarin + assert read_info(filename, encoding=encoding)["fields"][0] == mandarin def test_write_with_mask(tmp_path): From 3362daed71020373f7a7e0b239e9e6627d8f7ad7 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Fri, 5 Apr 2024 20:42:45 -0700 Subject: [PATCH 04/21] Fix detection of encoding for shapefile layers via SQL --- pyogrio/_io.pyx | 48 +++++++++++++++++++++++------- pyogrio/tests/test_geopandas_io.py | 29 ++++++++++++++++++ 2 files changed, 67 insertions(+), 10 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 852adadd..6cf4ef69 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -486,6 +486,9 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): # read without recoding. 
Hence, it is up to you to supply the data in the # appropriate encoding. More info: # https://gdal.org/development/rfc/rfc23_ogr_unicode.html#oftstring-oftstringlist-fields + # NOTE: this always returns False for the layer returned by executing SQL, + # even when it does support UTF-8; the layer underlying the SQL must + # be passed instead return "UTF-8" driver = get_driver(ogr_dataset) @@ -1109,6 +1112,7 @@ def ogr_read( cdef char **fields_c = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL + cdef OGRLayerH base_layer = NULL cdef int feature_count = 0 cdef double xmin, ymin, xmax, ymax cdef const char* prev_shape_encoding_c = NULL @@ -1169,6 +1173,11 @@ dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) + if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8 + encoding = "UTF-8" + if sql is None: if layer is None: layer = get_default_layer(ogr_dataset) @@ -1180,13 +1189,22 @@ # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8 - encoding = "UTF-8" + if encoding: + if get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8; ignore any encoding set by user + encoding = "UTF-8" else: - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + if sql is not None and get_driver(ogr_dataset) == "ESRI Shapefile": + # in order to properly test certain capabilities, we need to have the + # underlying layer referenced by the SQL query + base_layer = GDALDatasetGetLayer(ogr_dataset, 0) + + else: + base_layer = ogr_layer + + encoding = detect_encoding(ogr_dataset, base_layer) fields = get_fields(ogr_layer, encoding) @@ -1319,6 +1337,7 @@ def ogr_open_arrow( cdef const char *where_c = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL + cdef OGRLayerH base_layer = NULL cdef char **fields_c = NULL cdef const char *field_c = NULL cdef char **options = NULL @@ -1402,13 +1421,22 @@ # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8 - encoding = "UTF-8" + if encoding: + if get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8; ignore any encoding set by user + encoding = "UTF-8" else: - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + if sql is not None and get_driver(ogr_dataset) == "ESRI Shapefile": + # in order to properly test certain capabilities, we need to have the + # underlying layer referenced by the SQL query + base_layer = GDALDatasetGetLayer(ogr_dataset, 0) + + else: + base_layer = ogr_layer + + encoding = detect_encoding(ogr_dataset, base_layer) fields = get_fields(ogr_layer, encoding, use_arrow=True) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 11c14f0a..897b02ca 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1680,3 +1680,32 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, use_arrow): actual = 
read_dataframe(output_path, encoding=encoding, use_arrow=use_arrow) assert actual.columns[0] == mandarin assert actual[mandarin].values[0] == mandarin + + +def test_non_utf8_encoding_shapefile_sql(tmp_path, use_arrow): + encoding = "CP936" + + output_path = tmp_path / "test.shp" + + mandarin = "中文" + df = gp.GeoDataFrame( + {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326" + ) + write_dataframe(df, output_path, encoding=encoding) + + actual = read_dataframe( + output_path, + sql=f"select * from test where \"{mandarin}\" = '{mandarin}'", + use_arrow=use_arrow, + ) + assert actual.columns[0] == mandarin + assert actual[mandarin].values[0] == mandarin + + actual = read_dataframe( + output_path, + sql=f"select * from test where \"{mandarin}\" = '{mandarin}'", + encoding=encoding, + use_arrow=use_arrow, + ) + assert actual.columns[0] == mandarin + assert actual[mandarin].values[0] == mandarin From b99103819dae62d33e6849e6bc0f5cbf3b0d3ffd Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Sat, 6 Apr 2024 08:19:29 -0700 Subject: [PATCH 05/21] Expand encoding test cases --- pyogrio/tests/conftest.py | 20 +++++++++++++++ pyogrio/tests/test_geopandas_io.py | 32 ++++++++++-------------- pyogrio/tests/test_raw_io.py | 39 ++++++++++++++---------------- 3 files changed, 51 insertions(+), 40 deletions(-) diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index 76327b4f..4efc9b0e 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -133,3 +133,23 @@ def test_datetime(): @pytest.fixture(scope="session") def test_datetime_tz(): return _data_dir / "test_datetime_tz.geojson" + + +@pytest.fixture( + scope="session", + params=[ + # Nordic + ("CP865", "Å"), + # Japanese + ("CP932", "ホ"), + # Chinese + ("CP936", "中文"), + # ANSI + ("CP1252", "ÿ"), + # Greek + ("CP1253", "Φ"), + ], +) +def encoded_text(request): + """Return tuple with encoding name and very short sample text in that encoding""" + return request.param diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 897b02ca..da7d9f1e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1614,7 +1614,7 @@ def test_arrow_bool_exception(tmpdir): @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) -def test_non_utf8_encoding_io(tmp_path, ext): +def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source IMPORTANT: this may not be valid for the data source and will likely render @@ -1623,14 +1623,11 @@ def test_non_utf8_encoding_io(tmp_path, ext): NOTE: pyarrow cannot handle non-UTF-8 characters in this way """ - encoding = "CP936" + encoding, text = encoded_text output_path = tmp_path / f"test.{ext}" - mandarin = "中文" - df = gp.GeoDataFrame( - {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326" - ) + df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326") write_dataframe(df, output_path, encoding=encoding) # cannot open these files without specifying encoding @@ -1639,33 +1636,30 @@ def test_non_utf8_encoding_io(tmp_path, ext): # must provide encoding to read these properly actual = read_dataframe(output_path, encoding=encoding) - assert actual.columns[0] == mandarin - assert actual[mandarin].values[0] == mandarin + assert actual.columns[0] == text + assert actual[text].values[0] == text -def test_non_utf8_encoding_io_shapefile(tmp_path, use_arrow): - encoding = "CP936" +def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, 
use_arrow): + encoding, text = encoded_text output_path = tmp_path / "test.shp" - mandarin = "中文" - df = gp.GeoDataFrame( - {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326" - ) + df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326") write_dataframe(df, output_path, encoding=encoding) # NOTE: GDAL automatically creates a cpg file with the encoding name, which # means that if we read this without specifying the encoding it uses the # correct one actual = read_dataframe(output_path, use_arrow=use_arrow) - assert actual.columns[0] == mandarin - assert actual[mandarin].values[0] == mandarin + assert actual.columns[0] == text + assert actual[text].values[0] == text # verify that if cpg file is not present, that user-provided encoding must be used os.unlink(str(output_path).replace(".shp", ".cpg")) # We will assume ISO-8859-1, which is wrong - miscoded = mandarin.encode("CP936").decode("ISO-8859-1") + miscoded = text.encode(encoding).decode("ISO-8859-1") if use_arrow: # pyarrow cannot decode column name with incorrect encoding @@ -1678,8 +1672,8 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, use_arrow): # If encoding is provided, that should yield correct text actual = read_dataframe(output_path, encoding=encoding, use_arrow=use_arrow) - assert actual.columns[0] == mandarin - assert actual[mandarin].values[0] == mandarin + assert actual.columns[0] == text + assert actual[text].values[0] == text def test_non_utf8_encoding_shapefile_sql(tmp_path, use_arrow): diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 6b255462..0387c368 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1169,25 +1169,23 @@ def test_encoding_io_shapefile(tmp_path, read_encoding, write_encoding): @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) -def test_non_utf8_encoding_io(tmp_path, ext): +def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source IMPORTANT: this may not be valid for the data source and will likely render them unusable in other tools, but should successfully roundtrip unless we disable writing using other encodings. 
""" - - encoding = "CP936" + encoding, text = encoded_text # Point(0, 0) geometry = np.array( [bytes.fromhex("010100000000000000000000000000000000000000")], dtype=object ) - mandarin = "中文" - field_data = [np.array([mandarin], dtype=object)] + field_data = [np.array([text], dtype=object)] - fields = [mandarin] + fields = [text] meta = dict(geometry_type="Point", crs="EPSG:4326", encoding=encoding) filename = tmp_path / f"test.{ext}" @@ -1202,23 +1200,22 @@ def test_non_utf8_encoding_io(tmp_path, ext): # must provide encoding to read these properly actual_meta, _, _, actual_field_data = read(filename, encoding=encoding) - assert actual_meta["fields"][0] == mandarin - assert actual_field_data[0] == mandarin - assert read_info(filename, encoding=encoding)["fields"][0] == mandarin + assert actual_meta["fields"][0] == text + assert actual_field_data[0] == text + assert read_info(filename, encoding=encoding)["fields"][0] == text -def test_non_utf8_encoding_io_shapefile(tmp_path): - encoding = "CP936" +def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text): + encoding, text = encoded_text # Point(0, 0) geometry = np.array( [bytes.fromhex("010100000000000000000000000000000000000000")], dtype=object ) - mandarin = "中文" - field_data = [np.array([mandarin], dtype=object)] + field_data = [np.array([text], dtype=object)] - fields = [mandarin] + fields = [text] meta = dict(geometry_type="Point", crs="EPSG:4326", encoding=encoding) filename = tmp_path / "test.shp" @@ -1228,25 +1225,25 @@ def test_non_utf8_encoding_io_shapefile(tmp_path): # means that if we read this without specifying the encoding it uses the # correct one actual_meta, _, _, actual_field_data = read(filename) - assert actual_meta["fields"][0] == mandarin - assert actual_field_data[0] == mandarin - assert read_info(filename)["fields"][0] == mandarin + assert actual_meta["fields"][0] == text + assert actual_field_data[0] == text + assert read_info(filename)["fields"][0] == text # verify that if cpg file is not present, that user-provided encoding must be used os.unlink(str(filename).replace(".shp", ".cpg")) # We will assume ISO-8859-1, which is wrong bad_meta, _, _, bad_field_data = read(filename) - miscoded = mandarin.encode("CP936").decode("ISO-8859-1") + miscoded = text.encode(encoding).decode("ISO-8859-1") assert bad_meta["fields"][0] == miscoded assert bad_field_data[0] == miscoded assert read_info(filename)["fields"][0] == miscoded # If encoding is provided, that should yield correct text actual_meta, _, _, actual_field_data = read(filename, encoding=encoding) - assert actual_meta["fields"][0] == mandarin - assert actual_field_data[0] == mandarin - assert read_info(filename, encoding=encoding)["fields"][0] == mandarin + assert actual_meta["fields"][0] == text + assert actual_field_data[0] == text + assert read_info(filename, encoding=encoding)["fields"][0] == text def test_write_with_mask(tmp_path): From ecb56f678468b18c8382c5ab004b83bb1abd0aae Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Sat, 6 Apr 2024 13:35:28 -0700 Subject: [PATCH 06/21] Use ANSI code pages for alternative encodings (not DOS) and slightly improve docs --- docs/source/known_issues.md | 15 +++++++++++++-- pyogrio/_io.pyx | 4 ++-- pyogrio/tests/conftest.py | 14 ++++++++++---- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md index 29d390ee..a1a90adf 100644 --- a/docs/source/known_issues.md +++ b/docs/source/known_issues.md @@ -41,10 +41,21 @@ geometries from the data layer. 
## Character encoding -Pyogrio supports reading / writing data layers with a defined encoding. However, +Pyogrio supports reading / writing data layers with a defined encoding. When +the `encoding` option is not specified, GDAL will, where possible, attempt to +automatically decode from the native encoding to `UTF-8`, and pyogrio will report +that the encoding is `UTF-8` in that case instead of the native encoding. For +[ESRI Shapefiles](https://gdal.org/drivers/vector/shapefile.html#encoding), +GDAL will use the associated `.cpg` file or a code page specified in the `.dbf` +file to infer the native encoding, but may incorrectly assume the native encoding +is `ISO-8859-1` leading to miscoding errors. Most other drivers are assumed to +be in `UTF-8`, but it is possible (in theory) to specify the `encoding` parameter +manually to force conversions to use the specified encoding value. + +However, DataFrames do not currently allow arbitrary metadata, which means that we are currently unable to store encoding information for a data source. Text fields -are read into Python UTF-8 strings. +are read into Python `UTF-8` strings. ## No validation of geometry or field types diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 6cf4ef69..84cf4bb0 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1549,7 +1549,7 @@ def ogr_open_arrow( CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: - CPLFree(prev_shape_encoding) + CPLFree(prev_shape_encoding) def ogr_read_bounds( str path, object layer=None, @@ -1700,7 +1700,7 @@ def ogr_read_info( CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: - CPLFree(prev_shape_encoding) + CPLFree(prev_shape_encoding) return meta diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index 4efc9b0e..766d1bed 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -138,18 +138,24 @@ def test_datetime_tz(): @pytest.fixture( scope="session", params=[ - # Nordic - ("CP865", "Å"), # Japanese ("CP932", "ホ"), # Chinese ("CP936", "中文"), - # ANSI + # Central European + ("CP1250", "Đ"), + # Latin 1 / Western European ("CP1252", "ÿ"), # Greek ("CP1253", "Φ"), + # Arabic + ("CP1256", "ش"), ], ) def encoded_text(request): - """Return tuple with encoding name and very short sample text in that encoding""" + """Return tuple with encoding name and very short sample text in that encoding + NOTE: it was determined through testing that code pages for MS-DOS do not + consistently work across all Python installations (in particular, fail with conda), + but ANSI code pages appear to work properly. + """ return request.param From 994be50f0bf1a5c72d10e3f65002d7394263fcc5 Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Sat, 6 Apr 2024 13:48:56 -0700 Subject: [PATCH 07/21] Improve docs a little more --- docs/source/introduction.md | 18 +++++++++++------- docs/source/known_issues.md | 7 ++----- pyogrio/core.py | 4 ++++ pyogrio/geopandas.py | 4 ++-- pyogrio/raw.py | 4 ++-- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/docs/source/introduction.md b/docs/source/introduction.md index e7ac8cf1..c62357e1 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -89,6 +89,10 @@ first layer. } ``` +NOTE: pyogrio will report `UTF-8` if either the native encoding is likely to be +`UTF-8` or GDAL can automatically convert from the detected native encoding to +`UTF-8`. 
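A short sketch of what this note means in practice (the shapefile path is a placeholder; `read_info` is pyogrio's existing metadata function):

```python
from pyogrio import read_info

# GDAL detects the native encoding (e.g. from a .cpg sidecar or the .dbf code
# page) and recodes text to UTF-8, so pyogrio reports UTF-8 here rather than
# the native encoding
info = read_info("ne_110m_admin_0_countries.shp")  # placeholder path
assert info["encoding"] == "UTF-8"
```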
+ To read from a layer using name or index (the following are equivalent): ```python @@ -468,9 +472,9 @@ You can also read from a URL with this syntax: GDAL only supports datetimes at a millisecond resolution. Reading data will thus give at most millisecond resolution (`datetime64[ms]` data type). With pandas 2.0 -`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]` -correspondingly. For previous versions of pandas, `datetime64[ns]` is used as -ms precision was not supported. When writing, only precision up to +`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]` +correspondingly. For previous versions of pandas, `datetime64[ns]` is used as +ms precision was not supported. When writing, only precision up to ms is retained. Not all file formats have dedicated support to store datetime data, like ESRI @@ -478,11 +482,11 @@ Shapefile. For such formats, or if you require precision > ms, a workaround is t convert the datetimes to string. Timezone information is preserved where possible, however GDAL only represents -time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or -`zoneinfo`). This means that dataframes with columns containing multiple offsets +time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or +`zoneinfo`). This means that dataframes with columns containing multiple offsets (e.g. when switching from standard time to summer time) will be written correctly, -but when read via `pyogrio.read_dataframe()` will be returned as a UTC datetime -column, as there is no way to reconstruct the original timezone from the individual +but when read via `pyogrio.read_dataframe()` will be returned as a UTC datetime +column, as there is no way to reconstruct the original timezone from the individual offsets present. ## Dataset and layer creation options diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md index a1a90adf..1a26efac 100644 --- a/docs/source/known_issues.md +++ b/docs/source/known_issues.md @@ -48,14 +48,11 @@ that the encoding is `UTF-8` in that case instead of the native encoding. For [ESRI Shapefiles](https://gdal.org/drivers/vector/shapefile.html#encoding), GDAL will use the associated `.cpg` file or a code page specified in the `.dbf` file to infer the native encoding, but may incorrectly assume the native encoding -is `ISO-8859-1` leading to miscoding errors. Most other drivers are assumed to +is `ISO-8859-1`, leading to miscoding errors. Most other drivers are assumed to be in `UTF-8`, but it is possible (in theory) to specify the `encoding` parameter manually to force conversions to use the specified encoding value. -However, -DataFrames do not currently allow arbitrary metadata, which means that we are -currently unable to store encoding information for a data source. Text fields -are read into Python `UTF-8` strings. +Field names and values are read into Python `UTF-8` strings. ## No validation of geometry or field types diff --git a/pyogrio/core.py b/pyogrio/core.py index 2d43397d..a354b10d 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -217,6 +217,10 @@ def read_info( driver or if the data source is nonspatial. You can force it to be calculated using the ``force_total_bounds`` parameter. + ``encoding`` will be ``UTF-8`` if either the native encoding is likely to be + ``UTF-8`` or GDAL can automatically convert from the detected native encoding + to ``UTF-8``. 
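Because the shapefile conversion is implemented by temporarily overriding GDAL's `SHAPE_ENCODING` config option on the current thread, a pre-existing global value survives a read that specifies `encoding`. A sketch of that guarantee, mirroring the test added later in this series (the shapefile path is a placeholder):

```python
from pyogrio import get_gdal_config_option, read_info, set_gdal_config_options

set_gdal_config_options({"SHAPE_ENCODING": "CP1254"})
try:
    # encoding temporarily overrides SHAPE_ENCODING (thread-local) during read
    info = read_info("test.shp", encoding="CP936")  # placeholder shapefile
    # the globally set option is restored once the read completes
    assert get_gdal_config_option("SHAPE_ENCODING") == "CP1254"
finally:
    set_gdal_config_options({"SHAPE_ENCODING": None})  # clear between uses
```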
+ Parameters ---------- path : str or pathlib.Path diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index f647126e..afbdccef 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -106,8 +106,8 @@ def read_dataframe( of the layer in the data source. Defaults to first layer in data source. encoding : str, optional (default: None) If present, will be used as the encoding for reading string values from - the data source, unless encoding can be inferred directly from the data - source. + the data source. By default will automatically try to detect the native + encoding and decode to ``UTF-8``. columns : list-like, optional (default: all columns) List of column names to import from the data source. Column names must exactly match the names in the data source, and will be returned in diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 6499867a..d82e4611 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -67,8 +67,8 @@ def read( of the layer in the data source. Defaults to first layer in data source. encoding : str, optional (default: None) If present, will be used as the encoding for reading string values from - the data source, unless encoding can be inferred directly from the data - source. + the data source. By default will automatically try to detect the native + encoding and decode to ``UTF-8``. columns : list-like, optional (default: all columns) List of column names to import from the data source. Column names must exactly match the names in the data source, and will be returned in From 901320fb3d699438782406f36188b79e9c2e94ea Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 8 Apr 2024 06:48:04 -0700 Subject: [PATCH 08/21] consolidate operations per PR feedback --- pyogrio/_io.pyx | 138 +++++++++++++++++++++++-------------------------- 1 file changed, 66 insertions(+), 72 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 84cf4bb0..0db563d5 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -134,6 +134,34 @@ cdef char** dict_to_options(object values): return options +cdef const char* override_threadlocal_config_option(const char* key, const char* value): + """Set a thread-local config option (key=value) via CPLSetThreadLocalConfigOption + + Parameters + ---------- + key : const char* + Python string; must be encoded to UTF-8 before calling this + value : const char* + Python string; must be encoded to UTF-8 before calling this + + Returns + ------- + const char* + value previously set for key, so that it can be later restored. Caller + is responsible for freeing this via CPLFree() if not NULL. + """ + cdef const char *prev_value = CPLGetThreadLocalConfigOption(key, NULL) + if prev_value != NULL: + # strings returned from config options may be replaced via + # CPLSetThreadLocalConfigOption() below; GDAL instructs us to save a copy + # in a new string + prev_value = CPLStrdup(prev_value) + + CPLSetThreadLocalConfigOption(key, value) + + return prev_value + + cdef void* ogr_open(const char* path_c, int mode, char** options) except NULL: cdef void* ogr_dataset = NULL @@ -486,16 +514,27 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): # read without recoding. Hence, it is up to you to supply the data in the # appropriate encoding. 
More info: # https://gdal.org/development/rfc/rfc23_ogr_unicode.html#oftstring-oftstringlist-fields - # NOTE: this always returns False for the layer returned by executing SQL, - # even when it does support UTF-8; the layer underlying the SQL must - # be passed instead + # NOTE: for shapefiles, this always returns False for the layer returned + # when executing SQL, even when it supports UTF-8 (patched below); + # this may be fixed by https://github.com/OSGeo/gdal/pull/9649 (GDAL >=3.9.0?) return "UTF-8" driver = get_driver(ogr_dataset) if driver == "ESRI Shapefile": - # Typically, OGR_L_TestCapability returns True for OLCStringsAsUTF8 for ESRI - # Shapefile so this won't be reached. However, for old versions of GDAL and/or - # if libraries are missing, this fallback could be needed. + # OGR_L_TestCapability returns True for OLCStringsAsUTF8 (above) for + # shapefiles when a .cpg file is present with a valid encoding, or GDAL + # auto-detects the encoding from the code page of the .dbf file, or + # SHAPE_ENCODING config option is set, or ENCODING layer creation option + # is specified (shapefiles only). Otherwise, we can only assume that + # shapefiles are in their default encoding of ISO-8859-1 (which may be + # incorrect and must be overridden by user-provided encoding) + + # Always use the first layer to test capabilities until detection for + # SQL results from shapefiles is fixed (above) + # This block should only be used for unfixed versions of GDAL (<3.9.0?) + if OGR_L_TestCapability(GDALDatasetGetLayer(ogr_dataset, 0), OLCStringsAsUTF8): + return "UTF-8" + return "ISO-8859-1" if driver == "OSM": @@ -1112,11 +1151,9 @@ def ogr_read( cdef char **fields_c = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL - cdef OGRLayerH base_layer = NULL cdef int feature_count = 0 cdef double xmin, ymin, xmax, ymax - cdef const char* prev_shape_encoding_c = NULL - cdef char* prev_shape_encoding = NULL + cdef const char *prev_shape_encoding = NULL cdef bint override_shape_encoding = False path_b = path.encode('utf-8') @@ -1157,18 +1194,9 @@ # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) - if prev_shape_encoding_c != NULL: - # strings returned from config options may be replaced via - # CPLSetConfigOption() below; GDAL instructs us to save a copy - # in a new string - prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - - # set encoding used automatically by GDAL - encoding_b = encoding.encode('UTF-8') + encoding_b = encoding.encode("UTF-8") encoding_c = encoding_b - CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) - + prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1191,20 +1219,12 @@ # or from the system locale if encoding: if get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8; ignore any encoding set by user + # Because SHAPE_ENCODING is set above, GDAL will automatically + # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" else: - if sql is not None and get_driver(ogr_dataset) == "ESRI Shapefile": - # in order to properly test certain capabilities, we need to 
have the - # underlying layer referenced by the SQL query - base_layer = GDALDatasetGetLayer(ogr_dataset, 0) - - else: - base_layer = ogr_layer - - encoding = detect_encoding(ogr_dataset, base_layer) + encoding = detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding) @@ -1302,7 +1322,7 @@ def ogr_read( CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: - CPLFree(prev_shape_encoding) + CPLFree(prev_shape_encoding) return ( meta, @@ -1337,12 +1357,10 @@ def ogr_open_arrow( cdef const char *where_c = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL - cdef OGRLayerH base_layer = NULL cdef char **fields_c = NULL cdef const char *field_c = NULL cdef char **options = NULL - cdef const char* prev_shape_encoding_c = NULL - cdef char* prev_shape_encoding = NULL + cdef const char *prev_shape_encoding = NULL cdef bint override_shape_encoding = False cdef ArrowArrayStream stream cdef ArrowSchema schema @@ -1395,17 +1413,9 @@ def ogr_open_arrow( # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) - if prev_shape_encoding_c != NULL: - # strings returned from config options may be replaced via - # CPLSetConfigOption() below; GDAL instructs us to save a copy - # in a new string - prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - - # set encoding used automatically by GDAL - encoding_b = encoding.encode('UTF-8') + encoding_b = encoding.encode("UTF-8") encoding_c = encoding_b - CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) + prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1423,20 +1433,12 @@ def ogr_open_arrow( # or from the system locale if encoding: if get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8; ignore any encoding set by user + # Because SHAPE_ENCODING is set above, GDAL will automatically + # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" else: - if sql is not None and get_driver(ogr_dataset) == "ESRI Shapefile": - # in order to properly test certain capabilities, we need to have the - # underlying layer referenced by the SQL query - base_layer = GDALDatasetGetLayer(ogr_dataset, 0) - - else: - base_layer = ogr_layer - - encoding = detect_encoding(ogr_dataset, base_layer) + encoding = detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding, use_arrow=True) @@ -1619,7 +1621,7 @@ def ogr_read_info( cdef char **dataset_options = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL - cdef const char* prev_shape_encoding = NULL + cdef const char *prev_shape_encoding = NULL cdef bint override_shape_encoding = False path_b = path.encode('utf-8') @@ -1634,18 +1636,9 @@ def ogr_read_info( # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) - if prev_shape_encoding_c != NULL: - # strings returned from config options may be replaced via - # CPLSetConfigOption() below; 
GDAL instructs us to save a copy - # in a new string - prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - - # set encoding used automatically by GDAL - encoding_b = encoding.encode('UTF-8') + encoding_b = encoding.encode("UTF-8") encoding_c = encoding_b - CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) - + prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1657,8 +1650,8 @@ def ogr_read_info( # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8 + # Because SHAPE_ENCODING is set above, GDAL will automatically + # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" else: @@ -2127,8 +2120,9 @@ def ogr_write( if driver == 'ESRI Shapefile': - # force encoding for remaining operations to be in UTF-8 because - # GDAL will automatically convert those to the target encoding + # force encoding for remaining operations to be in UTF-8 (even if user + # provides an encoding) because GDAL will automatically convert those to + # the target encoding because ENCODING is set as a layer creation option encoding = "UTF-8" else: From 4313fcaab1226d20456428ebff84d08285a1b3fa Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 8 Apr 2024 08:35:37 -0700 Subject: [PATCH 09/21] verify SHAPE_ENCODING global option is retained --- pyogrio/tests/test_raw_io.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 0387c368..58c73153 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -12,6 +12,7 @@ list_drivers, read_info, set_gdal_config_options, + get_gdal_config_option, __gdal_version__, ) from pyogrio._compat import HAS_SHAPELY @@ -1245,6 +1246,17 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text): assert actual_field_data[0] == text assert read_info(filename, encoding=encoding)["fields"][0] == text + # verify that setting encoding does not corrupt SHAPE_ENCODING option if set + # globally (it is ignored during read when encoding is specified by user) + try: + set_gdal_config_options({"SHAPE_ENCODING": "CP1254"}) + _ = read(filename, encoding=encoding) + assert get_gdal_config_option("SHAPE_ENCODING") == "CP1254" + + finally: + # reset to clear between tests + set_gdal_config_options({"SHAPE_ENCODING": None}) + def test_write_with_mask(tmp_path): # Point(0, 0), null From ef602d51e05f3de8170779f61d0d62dfda18af8a Mon Sep 17 00:00:00 2001 From: "Brendan C. 
Ward" Date: Mon, 8 Apr 2024 14:42:12 -0700 Subject: [PATCH 10/21] Cleanup duplicate override to UTF-8 --- pyogrio/_io.pyx | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 9beda426..251b0038 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1201,11 +1201,6 @@ def ogr_read( dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) - if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8 - encoding = "UTF-8" - if sql is None: if layer is None: layer = get_default_layer(ogr_dataset) From 2bdf4dc2e3261efb6df31bb82fe90cbed9365799 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 17 Apr 2024 14:41:31 -0700 Subject: [PATCH 11/21] prevent combining encoding parameter and ENCODING open / creation option --- CHANGES.md | 3 +- pyogrio/_io.pyx | 30 +++++++++++---- pyogrio/tests/test_geopandas_io.py | 59 ++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 8 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 4e311c3c..d84a87ce 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -33,7 +33,8 @@ the data source not using the Arrow API (#391). - Properly handle decoding of ESRI Shapefiles with user-provided `encoding` option for `read`, `read_dataframe`, and `open_arrow`, and correctly encode - Shapefile field names and text values to the user-provided `encoding` (#384). + Shapefile field names and text values to the user-provided `encoding` for + `write` and `write_dataframe` (#384). ### Packaging diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 5b713b73..5de43e59 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1216,6 +1216,10 @@ def ogr_read( # or from the system locale if encoding: if get_driver(ogr_dataset) == "ESRI Shapefile": + # NOTE: SHAPE_ENCODING is a configuration option whereas ENCODING is the dataset open option + if "ENCODING" in dataset_kwargs: + raise ValueError('cannot provide both encoding parameter and "ENCODING" option; use encoding parameter to specify correct encoding for data source') + # Because SHAPE_ENCODING is set above, GDAL will automatically # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" @@ -1467,10 +1471,17 @@ def ogr_open_arrow( # or from the system locale if encoding: if get_driver(ogr_dataset) == "ESRI Shapefile": + # NOTE: SHAPE_ENCODING is a configuration option whereas ENCODING is the dataset open option + if "ENCODING" in dataset_kwargs: + raise ValueError('cannot provide both encoding parameter and "ENCODING" option; use encoding parameter to specify correct encoding for data source') + # Because SHAPE_ENCODING is set above, GDAL will automatically # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" + elif encoding.replace('-','').upper() != 'UTF8': + raise ValueError("non-UTF-8 encoding is not supported for Arrow; use the non-Arrow interface instead") + else: encoding = detect_encoding(ogr_dataset, ogr_layer) @@ -2078,23 +2089,28 @@ cdef create_ogr_dataset_layer( dataset_options = NULL raise exc - # Setup layer creation options + # Setup other layer creation options + for k, v in layer_kwargs.items(): + k = k.encode('UTF-8') + v = v.encode('UTF-8') + layer_options = CSLAddNameValue(layer_options, k, v) if driver == 'ESRI Shapefile': # ENCODING option must be set for shapefiles to properly write *.cpg # file containing the encoding; this is not a supported option for - # other drivers + # other 
drivers. This is done after setting general options above + # to override ENCODING if passed by the user as a layer option. if encoding is None: encoding = "UTF-8" + + else: + if "ENCODING" in layer_kwargs: + raise ValueError('cannot provide both encoding parameter and "ENCODING" layer creation option; use the encoding parameter') + encoding_b = encoding.upper().encode('UTF-8') encoding_c = encoding_b layer_options = CSLSetNameValue(layer_options, "ENCODING", encoding_c) - # Setup other layer creation options - for k, v in layer_kwargs.items(): - k = k.encode('UTF-8') - v = v.encode('UTF-8') - layer_options = CSLAddNameValue(layer_options, k, v) ### Get geometry type # TODO: this is brittle for 3D / ZM / M types diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index e925a9c0..2cf39331 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1711,6 +1711,33 @@ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): assert actual[text].values[0] == text +@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text): + """Verify that we write non-UTF data to the data source + + IMPORTANT: this may not be valid for the data source and will likely render + them unusable in other tools, but should successfully roundtrip unless we + disable writing using other encodings. + + NOTE: pyarrow cannot handle non-UTF-8 characters in this way + """ + + encoding, text = encoded_text + output_path = tmp_path / f"test.{ext}" + + df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326") + write_dataframe(df, output_path, encoding=encoding) + + # cannot open these files without specifying encoding + with pytest.raises(UnicodeDecodeError): + read_dataframe(output_path) + + with pytest.raises( + ValueError, match="non-UTF-8 encoding is not supported for Arrow" + ): + read_dataframe(output_path, encoding=encoding, use_arrow=True) + + def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow): encoding, text = encoded_text @@ -1746,6 +1773,38 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow): assert actual.columns[0] == text assert actual[text].values[0] == text + # if ENCODING open option, that should yield correct text + actual = read_dataframe(output_path, use_arrow=use_arrow, ENCODING=encoding) + assert actual.columns[0] == text + assert actual[text].values[0] == text + + +def test_encoding_read_option_collision_shapefile(naturalearth_lowres, use_arrow): + """Providing both encoding parameter and ENCODING open option (even if blank) is not allowed""" + + with pytest.raises( + ValueError, match='cannot provide both encoding parameter and "ENCODING" option' + ): + read_dataframe( + naturalearth_lowres, encoding="CP936", ENCODING="", use_arrow=use_arrow + ) + + +def test_encoding_write_layer_option_collision_shapefile(tmp_path, encoded_text): + """Providing both encoding parameter and ENCODING layer creation option (even if blank) is not allowed""" + encoding, text = encoded_text + + output_path = tmp_path / "test.shp" + df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326") + + with pytest.raises( + ValueError, + match='cannot provide both encoding parameter and "ENCODING" layer creation option', + ): + write_dataframe( + df, output_path, encoding=encoding, layer_options={"ENCODING": ""} + ) + def test_non_utf8_encoding_shapefile_sql(tmp_path, use_arrow): encoding = 
"CP936" From b8cc9024bce92df356a1ad502d77ed7a32f4b8cc Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 17 Apr 2024 14:48:58 -0700 Subject: [PATCH 12/21] Fix missing pyarrow constraint for test --- pyogrio/tests/test_geopandas_io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 2cf39331..f1610583 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1711,6 +1711,7 @@ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): assert actual[text].values[0] == text +@requires_pyarrow_api @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source From e7db258182cf7193ab9a08b2bacf39a7327587c8 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 17 Apr 2024 15:33:51 -0700 Subject: [PATCH 13/21] Try to verify that platform default encoding is used for CSV by default --- pyogrio/tests/test_geopandas_io.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f1610583..6f0030b7 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1,6 +1,9 @@ import contextlib from datetime import datetime +import locale import os +import sys + import numpy as np import pytest @@ -81,6 +84,19 @@ def test_read_csv_encoding(tmp_path, encoding): assert df.city.tolist() == ["Zürich"] assert df.näme.tolist() == ["Wilhelm Röntgen"] + # verify that read defaults to platform encoding + if ( + sys.platform == "win32" + and encoding == "cp1252" + and locale.getpreferredencoding().lower() == encoding + ): + df = read_dataframe(csv_path) + + assert len(df) == 1 + assert df.columns.tolist() == ["näme", "city"] + assert df.city.tolist() == ["Zürich"] + assert df.näme.tolist() == ["Wilhelm Röntgen"] + def test_read_dataframe(naturalearth_lowres_all_ext): df = read_dataframe(naturalearth_lowres_all_ext) From eaecb18978dffaba341a7a1ae77595979d637eb4 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 17 Apr 2024 15:40:10 -0700 Subject: [PATCH 14/21] split CSV platform encoding test out to verify it executes on Windows --- pyogrio/tests/test_geopandas_io.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 6f0030b7..f570d563 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -2,7 +2,6 @@ from datetime import datetime import locale import os -import sys import numpy as np import pytest @@ -84,18 +83,24 @@ def test_read_csv_encoding(tmp_path, encoding): assert df.city.tolist() == ["Zürich"] assert df.näme.tolist() == ["Wilhelm Röntgen"] - # verify that read defaults to platform encoding - if ( - sys.platform == "win32" - and encoding == "cp1252" - and locale.getpreferredencoding().lower() == encoding - ): - df = read_dataframe(csv_path) - assert len(df) == 1 - assert df.columns.tolist() == ["näme", "city"] - assert df.city.tolist() == ["Zürich"] - assert df.näme.tolist() == ["Wilhelm Röntgen"] +@pytest.mark.skipif( + locale.getpreferredencoding().upper() != "UTF-8", + reason="test requires non-UTF-8 default platform", +) +def test_read_csv_platform_encoding(tmp_path): + """verify that read defaults to platform encoding; only works on Windows (CP1252)""" + csv_path = tmp_path / 
"test.csv" + with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv: + csv.write("näme,city\n") + csv.write("Wilhelm Röntgen,Zürich\n") + + df = read_dataframe(csv_path) + + assert len(df) == 1 + assert df.columns.tolist() == ["näme", "city"] + assert df.city.tolist() == ["Zürich"] + assert df.näme.tolist() == ["Wilhelm Röntgen"] def test_read_dataframe(naturalearth_lowres_all_ext): From a6235017196b075c049dfb4b0a11fc16a080401e Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 17 Apr 2024 15:50:33 -0700 Subject: [PATCH 15/21] Fix bug in skip of CSV encoding test --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f570d563..fcf8e5db 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -85,7 +85,7 @@ def test_read_csv_encoding(tmp_path, encoding): @pytest.mark.skipif( - locale.getpreferredencoding().upper() != "UTF-8", + locale.getpreferredencoding().upper() == "UTF-8", reason="test requires non-UTF-8 default platform", ) def test_read_csv_platform_encoding(tmp_path): From d2b43f34a1b2a5e5d811587496b8306698d9385a Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 16:35:38 -0700 Subject: [PATCH 16/21] cleanup, add arrow I/O tests --- pyogrio/_io.pyx | 74 +++++++++------------- pyogrio/tests/test_arrow.py | 98 ++++++++++++++++++++++++++++++ pyogrio/tests/test_geopandas_io.py | 9 --- pyogrio/tests/test_raw_io.py | 2 +- 4 files changed, 128 insertions(+), 55 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 20f7a33b..78489657 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -138,15 +138,13 @@ cdef char** dict_to_options(object values): return options -cdef const char* override_threadlocal_config_option(const char* key, const char* value): +cdef const char* override_threadlocal_config_option(str key, str value): """Set the CPLSetThreadLocalConfigOption for key=value Parameters ---------- - key : const char* - python string must decoded to UTF-8 before calling this - value : const char* - python string must decoded to UTF-8 before calling this + key : str + value : str Returns ------- @@ -154,14 +152,22 @@ cdef const char* override_threadlocal_config_option(const char* key, const char* value previously set for key, so that it can be later restored. Caller is responsible for freeing this via CPLFree() if not NULL. 
""" - cdef const char *prev_value = CPLGetThreadLocalConfigOption(key, NULL) + + key_b = key.encode("UTF-8") + cdef const char* key_c = key_b + + value_b = value.encode("UTF-8") + cdef const char* value_c = value_b + + + cdef const char *prev_value = CPLGetThreadLocalConfigOption(key_c, NULL) if prev_value != NULL: # strings returned from config options may be replaced via # CPLSetConfigOption() below; GDAL instructs us to save a copy # in a new string prev_value = CPLStrdup(prev_value) - CPLSetThreadLocalConfigOption(key, value) + CPLSetThreadLocalConfigOption(key_c, value_c) return prev_value @@ -1191,16 +1197,13 @@ def ogr_read( try: if encoding: - override_shape_encoding = True - # for shapefiles, SHAPE_ENCODING must be set before opening the file # to prevent automatic decoding to UTF-8 by GDAL, so we save previous # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - encoding_b = encoding.encode("UTF-8") - encoding_c = encoding_b - prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) + override_shape_encoding = True + prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1447,15 +1450,7 @@ def ogr_open_arrow( try: if encoding: override_shape_encoding = True - - # for shapefiles, SHAPE_ENCODING must be set before opening the file - # to prevent automatic decoding to UTF-8 by GDAL, so we save previous - # SHAPE_ENCODING so that it can be restored later - # (we do this for all data sources where encoding is set because - # we don't know the driver until after it is opened, which is too late) - encoding_b = encoding.encode("UTF-8") - encoding_c = encoding_b - prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) + prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1473,12 +1468,9 @@ def ogr_open_arrow( # or from the system locale if encoding: if get_driver(ogr_dataset) == "ESRI Shapefile": - # NOTE: SHAPE_ENCODING is a configuration option whereas ENCODING is the dataset open option if "ENCODING" in dataset_kwargs: raise ValueError('cannot provide both encoding parameter and "ENCODING" option; use encoding parameter to specify correct encoding for data source') - # Because SHAPE_ENCODING is set above, GDAL will automatically - # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" elif encoding.replace('-','').upper() != 'UTF8': @@ -1652,6 +1644,7 @@ def ogr_open_arrow( if prev_shape_encoding != NULL: CPLFree(prev_shape_encoding) + def ogr_read_bounds( str path, object layer=None, @@ -1730,15 +1723,7 @@ def ogr_read_info( try: if encoding: override_shape_encoding = True - - # for shapefiles, SHAPE_ENCODING must be set before opening the file - # to prevent automatic decoding to UTF-8 by GDAL, so we save previous - # SHAPE_ENCODING so that it can be restored later - # (we do this for all data sources where encoding is set because - # we don't know the driver until after it is opened, which is too late) - encoding_b = encoding.encode("UTF-8") - encoding_c = encoding_b - prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) + prev_shape_encoding = 
override_threadlocal_config_option("SHAPE_ENCODING", encoding) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1747,13 +1732,8 @@ def ogr_read_info( layer = get_default_layer(ogr_dataset) ogr_layer = get_ogr_layer(ogr_dataset, layer) - # Encoding is derived from the user, from the dataset capabilities / type, - # or from the system locale if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically - # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" - else: encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) @@ -2102,13 +2082,11 @@ cdef create_ogr_dataset_layer( # file containing the encoding; this is not a supported option for # other drivers. This is done after setting general options above # to override ENCODING if passed by the user as a layer option. - if encoding is None: - encoding = "UTF-8" - - else: - if "ENCODING" in layer_kwargs: - raise ValueError('cannot provide both encoding parameter and "ENCODING" layer creation option; use the encoding parameter') + if encoding and "ENCODING" in layer_kwargs: + raise ValueError('cannot provide both encoding parameter and "ENCODING" layer creation option; use the encoding parameter') + # always write to UTF-8 if encoding is not set + encoding = encoding or "UTF-8" encoding_b = encoding.upper().encode('UTF-8') encoding_c = encoding_b layer_options = CSLSetNameValue(layer_options, "ENCODING", encoding_c) @@ -2226,7 +2204,6 @@ def ogr_write( &ogr_dataset, &ogr_layer, ) - if driver == 'ESRI Shapefile': # force encoding for remaining operations to be in UTF-8 (even if user # provides an encoding) because GDAL will automatically convert those to @@ -2479,10 +2456,17 @@ def ogr_write_arrow( &ogr_dataset, &ogr_layer, ) + # only shapefile supports non-UTF encoding because ENCODING option is set + # during dataset creation and GDAL auto-translates from UTF-8 values to that + # encoding + if encoding and encoding.replace('-','').upper() != 'UTF8' and driver != 'ESRI Shapefile': + raise ValueError("non-UTF-8 encoding is not supported for Arrow; use the non-Arrow interface instead") + if geometry_name: opts = {"GEOMETRY_NAME": geometry_name} else: opts = {} + options = dict_to_options(opts) try: diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index a3b165c5..05b27ac7 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -14,6 +14,7 @@ read_dataframe, read_info, list_layers, + get_gdal_config_option, set_gdal_config_options, ) from pyogrio.raw import open_arrow, read_arrow, write, write_arrow @@ -797,3 +798,100 @@ def test_write_schema_error_message(tmpdir): geometry_type="Point", geometry_name="geometry", ) + + +@requires_arrow_write_api +def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text): + encoding, text = encoded_text + + table = pa.table( + { + # Point(0, 0) + "geometry": pa.array( + [bytes.fromhex("010100000000000000000000000000000000000000")] + ), + text: pa.array([text]), + } + ) + + filename = tmp_path / "test.shp" + write_arrow( + table, + filename, + geometry_type="Point", + geometry_name="geometry", + crs="EPSG:4326", + encoding=encoding, + ) + + # NOTE: GDAL automatically creates a cpg file with the encoding name, which + # means that if we read this without specifying the encoding it uses the + # correct one + schema, table = read_arrow(filename) + assert schema["fields"][0] == text + assert table[text][0].as_py() == 
text + + # verify that if cpg file is not present, that user-provided encoding must be used + os.unlink(str(filename).replace(".shp", ".cpg")) + + # We will assume ISO-8859-1, which is wrong + miscoded = text.encode(encoding).decode("ISO-8859-1") + bad_schema = read_arrow(filename)[0] + assert bad_schema["fields"][0] == miscoded + # table cannot be decoded to UTF-8 without UnicodeDecodeErrors + + # If encoding is provided, that should yield correct text + schema, table = read_arrow(filename, encoding=encoding) + assert schema["fields"][0] == text + assert table[text][0].as_py() == text + + # verify that setting encoding does not corrupt SHAPE_ENCODING option if set + # globally (it is ignored during read when encoding is specified by user) + try: + set_gdal_config_options({"SHAPE_ENCODING": "CP1254"}) + _ = read_arrow(filename, encoding=encoding) + assert get_gdal_config_option("SHAPE_ENCODING") == "CP1254" + + finally: + # reset to clear between tests + set_gdal_config_options({"SHAPE_ENCODING": None}) + + +@requires_arrow_write_api +def test_encoding_write_layer_option_collision_shapefile(tmpdir, naturalearth_lowres): + """Providing both encoding parameter and ENCODING layer creation option (even if blank) is not allowed""" + + meta, table = read_arrow(naturalearth_lowres) + filename = os.path.join(str(tmpdir), "test.shp") + + with pytest.raises( + ValueError, + match='cannot provide both encoding parameter and "ENCODING" layer creation option', + ): + write_arrow( + table, + filename, + crs=meta["crs"], + geometry_type="MultiPolygon", + geometry_name=meta["geometry_name"] or "wkb_geometry", + encoding="CP936", + layer_options={"ENCODING": ""}, + ) + + +@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +def test_non_utf8_encoding_io_arrow_exception(tmpdir, naturalearth_lowres, ext): + meta, table = read_arrow(naturalearth_lowres) + filename = os.path.join(str(tmpdir), f"test.{ext}") + + with pytest.raises( + ValueError, match="non-UTF-8 encoding is not supported for Arrow" + ): + write_arrow( + table, + filename, + crs=meta["crs"], + geometry_type="MultiPolygon", + geometry_name=meta["geometry_name"] or "wkb_geometry", + encoding="CP936", + ) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index fcf8e5db..72fa5c3c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1735,15 +1735,6 @@ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): @requires_pyarrow_api @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text): - """Verify that we write non-UTF data to the data source - - IMPORTANT: this may not be valid for the data source and will likely render - them unusable in other tools, but should successfully roundtrip unless we - disable writing using other encodings. 
- - NOTE: pyarrow cannot handle non-UTF-8 characters in this way - """ - encoding, text = encoded_text output_path = tmp_path / f"test.{ext}" diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 6eb1acaa..d1526ee3 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1237,8 +1237,8 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text): os.unlink(str(filename).replace(".shp", ".cpg")) # We will assume ISO-8859-1, which is wrong - bad_meta, _, _, bad_field_data = read(filename) miscoded = text.encode(encoding).decode("ISO-8859-1") + bad_meta, _, _, bad_field_data = read(filename) assert bad_meta["fields"][0] == miscoded assert bad_field_data[0] == miscoded assert read_info(filename)["fields"][0] == miscoded From 815620a64168fc0cfe17392b98ceb3a91ce3d232 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 17:54:24 -0700 Subject: [PATCH 17/21] Fix missing test annotation --- pyogrio/tests/test_arrow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 05b27ac7..225fbb57 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -879,6 +879,7 @@ def test_encoding_write_layer_option_collision_shapefile(tmpdir, naturalearth_lo ) +@requires_arrow_write_api @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) def test_non_utf8_encoding_io_arrow_exception(tmpdir, naturalearth_lowres, ext): meta, table = read_arrow(naturalearth_lowres) From 6eca2651945ffb0f25ac5cd33cc04d36959c88ea Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 18:03:47 -0700 Subject: [PATCH 18/21] Don't fail in error handler if message cannot be decoded to UTF-8 --- pyogrio/_err.pyx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pyogrio/_err.pyx b/pyogrio/_err.pyx index 089042c5..aa8461af 100644 --- a/pyogrio/_err.pyx +++ b/pyogrio/_err.pyx @@ -153,9 +153,13 @@ cdef inline object exc_check(): else: # Reformat messages. msg_b = err_msg - msg = msg_b.decode('utf-8') - msg = msg.replace("`", "'") - msg = msg.replace("\n", " ") + + try: + msg = msg_b.decode('utf-8') + msg = msg.replace("`", "'") + msg = msg.replace("\n", " ") + except UnicodeDecodeError as exc: + msg = f"Could not decode error message to UTF-8. Raw error bytes: {msg}" if err_type == 3: CPLErrorReset() From 34bacc31a001c7a7b73755566b81076eb8fbb1f2 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 18:06:28 -0700 Subject: [PATCH 19/21] Fix bug in attempted fix --- pyogrio/_err.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/_err.pyx b/pyogrio/_err.pyx index aa8461af..04097c7a 100644 --- a/pyogrio/_err.pyx +++ b/pyogrio/_err.pyx @@ -159,7 +159,7 @@ cdef inline object exc_check(): msg = msg.replace("`", "'") msg = msg.replace("\n", " ") except UnicodeDecodeError as exc: - msg = f"Could not decode error message to UTF-8. Raw error bytes: {msg}" + msg = f"Could not decode error message to UTF-8. 
Raw error bytes: {msg_b}" if err_type == 3: CPLErrorReset() From e088311dad7313a7d8afb17183ba32d04b714537 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 18:19:59 -0700 Subject: [PATCH 20/21] Fix failing test for GDAL >= 3.9 --- pyogrio/_err.pyx | 2 +- pyogrio/tests/test_geopandas_io.py | 4 +++- pyogrio/tests/test_raw_io.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pyogrio/_err.pyx b/pyogrio/_err.pyx index 04097c7a..51f2fcfb 100644 --- a/pyogrio/_err.pyx +++ b/pyogrio/_err.pyx @@ -159,7 +159,7 @@ cdef inline object exc_check(): msg = msg.replace("`", "'") msg = msg.replace("\n", " ") except UnicodeDecodeError as exc: - msg = f"Could not decode error message to UTF-8. Raw error bytes: {msg_b}" + msg = f"Could not decode error message to UTF-8. Raw error: {msg_b}" if err_type == 3: CPLErrorReset() diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 72fa5c3c..a6360d4c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1705,7 +1705,7 @@ def test_arrow_bool_exception(tmpdir, ext): _ = read_dataframe(filename, use_arrow=True) -@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +@pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source @@ -1713,6 +1713,8 @@ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): them unusable in other tools, but should successfully roundtrip unless we disable writing using other encodings. + NOTE: FlatGeobuff driver cannot handle non-UTF data in GDAL >= 3.9 + NOTE: pyarrow cannot handle non-UTF-8 characters in this way """ diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index d1526ee3..915c9438 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1172,13 +1172,15 @@ def test_encoding_io_shapefile(tmp_path, read_encoding, write_encoding): ) -@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +@pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source IMPORTANT: this may not be valid for the data source and will likely render them unusable in other tools, but should successfully roundtrip unless we disable writing using other encodings. 
+ + NOTE: FlatGeobuff driver cannot handle non-UTF data in GDAL >= 3.9 """ encoding, text = encoded_text From 58d1d3f5c681fc0131259d75735fc7e80b0dd1bc Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 18:24:10 -0700 Subject: [PATCH 21/21] Fix other failing FlatGeobuff tests --- pyogrio/tests/test_arrow.py | 2 +- pyogrio/tests/test_geopandas_io.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 225fbb57..9c134d05 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -880,7 +880,7 @@ def test_encoding_write_layer_option_collision_shapefile(tmpdir, naturalearth_lo @requires_arrow_write_api -@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +@pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_non_utf8_encoding_io_arrow_exception(tmpdir, naturalearth_lowres, ext): meta, table = read_arrow(naturalearth_lowres) filename = os.path.join(str(tmpdir), f"test.{ext}") diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index a6360d4c..dbda1173 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1735,7 +1735,7 @@ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): @requires_pyarrow_api -@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +@pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text): encoding, text = encoded_text output_path = tmp_path / f"test.{ext}"
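
Usage note (illustrative, not part of the patches above): the core technique in this
series is a save/override/restore of GDAL's SHAPE_ENCODING configuration option around
opening a shapefile. A rough Python-level analogue of that sequence is sketched below
using pyogrio's public config helpers; the helper name read_with_shape_encoding is
hypothetical, and pyogrio itself performs this step in Cython via the thread-local
CPLGetThreadLocalConfigOption / CPLSetThreadLocalConfigOption / CPLStrdup / CPLFree
calls shown in the diffs, so user code does not need to do this by hand.

    from pyogrio import get_gdal_config_option, set_gdal_config_options
    from pyogrio.raw import read

    def read_with_shape_encoding(path, encoding):
        # save whatever SHAPE_ENCODING is currently set (None if unset)
        prev = get_gdal_config_option("SHAPE_ENCODING")

        # override before opening the file: GDAL recodes DBF text from
        # `encoding` to UTF-8 while the shapefile is being read
        set_gdal_config_options({"SHAPE_ENCODING": encoding})
        try:
            return read(path)
        finally:
            # restore the prior state; a value of None clears the option
            set_gdal_config_options({"SHAPE_ENCODING": prev})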
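
For the end-to-end behavior the series is after, a minimal round-trip sketch at the
geopandas layer (the file name and the CP936 field name and value are illustrative
only):

    import geopandas as gp
    from shapely.geometry import Point
    from pyogrio import read_dataframe, write_dataframe

    # write a shapefile whose DBF text is encoded as CP936; GDAL also writes
    # a .cpg sidecar recording the encoding
    df = gp.GeoDataFrame({"城市": ["北京"], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
    write_dataframe(df, "test.shp", encoding="CP936")

    # with the .cpg present, GDAL detects CP936 on its own; if the sidecar is
    # lost, passing encoding= still round-trips the field name and value
    actual = read_dataframe("test.shp", encoding="CP936")
    assert actual.columns[0] == "城市"
    assert actual["城市"][0] == "北京"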