From 2bfc862cf5a98642d9ae97748fbc98ce5d8e6f0d Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 4 Apr 2024 19:37:55 -0700 Subject: [PATCH 01/21] Use SHAPE_ENCODING to disable auto-decoding of shapefiles --- pyogrio/_io.pyx | 92 ++++++++++++++++++++++++++++++++++++++++++++++-- pyogrio/_ogr.pxd | 1 + 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 733064af..d4aa7b8a 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1111,6 +1111,9 @@ def ogr_read( cdef OGRLayerH ogr_layer = NULL cdef int feature_count = 0 cdef double xmin, ymin, xmax, ymax + cdef const char* prev_shape_encoding_c = NULL + cdef char* prev_shape_encoding = NULL + cdef bint override_shape_encoding = False path_b = path.encode('utf-8') path_c = path_b @@ -1142,6 +1145,25 @@ def ogr_read( raise ValueError("'max_features' must be >= 0") try: + if encoding: + override_shape_encoding = True + + # for shapefiles, SHAPE_ENCODING must be set before opening the file + # to prevent automatic decoding to UTF-8 by GDAL, so we save previous + # SHAPE_ENCODING so that it can be restored later + # (we do this for all data sources where encoding is set because + # we don't know the driver until after it is opened, which is too late) + prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + if prev_shape_encoding_c != NULL: + # strings returned from config options may be replaced via + # CPLSetConfigOption() below; GDAL instructs us to save a copy + # in a new string + prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) + + # set empty string to disable auto-decoding by GDAL to UTF-8 + CPLSetConfigOption("SHAPE_ENCODING", "") + + dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1156,6 +1178,7 @@ def ogr_read( # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale + # FIXME: it appears we need to set ENCODING sooner encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding) @@ -1249,6 +1272,13 @@ def ogr_read( GDALClose(ogr_dataset) ogr_dataset = NULL + # reset SHAPE_ENCODING config parameter if temporarily set above + if override_shape_encoding: + CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + + if prev_shape_encoding != NULL: + CPLFree(prev_shape_encoding) + return ( meta, fid_data, @@ -1285,6 +1315,9 @@ def ogr_open_arrow( cdef char **fields_c = NULL cdef const char *field_c = NULL cdef char **options = NULL + cdef const char* prev_shape_encoding_c = NULL + cdef char* prev_shape_encoding = NULL + cdef bint override_shape_encoding = False cdef ArrowArrayStream stream cdef ArrowSchema schema @@ -1328,6 +1361,25 @@ def ogr_open_arrow( reader = None try: + if encoding: + override_shape_encoding = True + + # for shapefiles, SHAPE_ENCODING must be set before opening the file + # to prevent automatic decoding to UTF-8 by GDAL, so we save previous + # SHAPE_ENCODING so that it can be restored later + # (we do this for all data sources where encoding is set because + # we don't know the driver until after it is opened, which is too late) + prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + if prev_shape_encoding_c != NULL: + # strings returned from config options may be replaced via + # CPLSetConfigOption() below; GDAL instructs us to save a copy + # in a new string + prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) + + # set empty string to disable auto-decoding by GDAL to UTF-8 + 
CPLSetConfigOption("SHAPE_ENCODING", "") + + dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1450,6 +1502,13 @@ def ogr_open_arrow( GDALClose(ogr_dataset) ogr_dataset = NULL + # reset SHAPE_ENCODING config parameter if temporarily set above + if override_shape_encoding: + CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + + if prev_shape_encoding != NULL: + CPLFree(prev_shape_encoding) + def ogr_read_bounds( str path, object layer=None, @@ -1518,12 +1577,32 @@ def ogr_read_info( cdef char **dataset_options = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL + cdef const char* prev_shape_encoding = NULL + cdef bint override_shape_encoding = False path_b = path.encode('utf-8') path_c = path_b - try: + if encoding: + override_shape_encoding = True + + # for shapefiles, SHAPE_ENCODING must be set before opening the file + # to prevent automatic decoding to UTF-8 by GDAL, so we save previous + # SHAPE_ENCODING so that it can be restored later + # (we do this for all data sources where encoding is set because + # we don't know the driver until after it is opened, which is too late) + prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + if prev_shape_encoding_c != NULL: + # strings returned from config options may be replaced via + # CPLSetConfigOption() below; GDAL instructs us to save a copy + # in a new string + prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) + + # set empty string to disable auto-decoding by GDAL to UTF-8 + CPLSetConfigOption("SHAPE_ENCODING", "") + + dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1566,6 +1645,13 @@ def ogr_read_info( GDALClose(ogr_dataset) ogr_dataset = NULL + # reset SHAPE_ENCODING config parameter if temporarily set above + if override_shape_encoding: + CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + + if prev_shape_encoding != NULL: + CPLFree(prev_shape_encoding) + return meta @@ -1600,7 +1686,7 @@ cdef str get_default_layer(OGRDataSourceH ogr_dataset): ------- str the name of the default layer to be read. - + """ layers = get_layer_names(ogr_dataset) first_layer_name = layers[0][0] @@ -1632,7 +1718,7 @@ cdef get_layer_names(OGRDataSourceH ogr_dataset): ------- ndarray(n) array of layer names - + """ cdef OGRLayerH ogr_layer = NULL diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 35fbd29a..e62241dc 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -12,6 +12,7 @@ cdef extern from "cpl_conv.h": const char* CPLFindFile(const char *pszClass, const char *filename) const char* CPLGetConfigOption(const char* key, const char* value) void CPLSetConfigOption(const char* key, const char* value) + char* CPLStrdup(const char* string) cdef extern from "cpl_error.h" nogil: From 2b38f9466d92fa2f33c9defc102f9872a9d05afb Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Thu, 4 Apr 2024 21:18:03 -0700 Subject: [PATCH 02/21] Correctly handle I/O for non-UTF-8 shapefiles --- CHANGES.md | 4 +++- pyogrio/_io.pyx | 20 ++++++++++++------ pyogrio/tests/test_raw_io.py | 39 ++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 7086b1e9..5c944663 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,7 +9,6 @@ including the CRS, when using GDAL 3.8 or higher (#366). - Warn when reading from a multilayer file without specifying a layer (#362). 
- ### Bug fixes - Fix error in `write_dataframe` if input has a date column and @@ -19,6 +18,9 @@ - Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if a boolean column is detected due to error in GDAL reading boolean values (#335) this has been fixed in GDAL >= 3.8.3. +- Disable GDAL's auto-decoding of ESRI Shapefiles to `UTF-8` in order to enable + user-provided `encoding` option to work properly for `read`, `read_dataframe`, and `open_arrow`, and correctly encode Shapefile field names and text values to the + user-provided `encoding` (#384). ### Packaging diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index d4aa7b8a..04c9190e 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1948,8 +1948,9 @@ cdef create_ogr_dataset_layer( # Setup layer creation options if driver == 'ESRI Shapefile': - # Fiona only sets encoding for shapefiles; other drivers do not support - # encoding as an option. + # ENCODING option must be set for shapefiles to properly write *.cpg + # file containing the encoding; this is not a supported option for + # other drivers if encoding is None: encoding = "UTF-8" encoding_b = encoding.upper().encode('UTF-8') @@ -2074,10 +2075,17 @@ def ogr_write( &ogr_dataset, &ogr_layer, ) - # Now the dataset and layer have been created, we can properly determine the - # encoding. It is derived from the user, from the dataset capabilities / type, - # or from the system locale - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + + if driver == 'ESRI Shapefile': + # force encoding for remaining operations to be in UTF-8 because + # GDAL will automatically convert those to the target encoding + encoding = "UTF-8" + + else: + # Now the dataset and layer have been created, we can properly determine the + # encoding. 
It is derived from the user, from the dataset capabilities / type, + # or from the system locale + encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) ### Create the fields field_types = None diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 362c5773..8f711338 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1168,6 +1168,45 @@ def test_encoding_io_shapefile(tmp_path, read_encoding, write_encoding): ) +def test_non_utf8_encoding_shapefile(tmp_path): + encoding = "CP936" + + # Point(0, 0) + geometry = np.array( + [bytes.fromhex("010100000000000000000000000000000000000000")], dtype=object + ) + + mandarin = "中文" + field_data = [np.array([mandarin], dtype=object)] + + fields = [mandarin] + meta = dict(geometry_type="Point", crs="EPSG:4326", encoding=encoding) + + filename = tmp_path / "test.shp" + # NOTE: GDAL automatically creates a cpg file with the encoding name, which + # means that if we read this without specifying the encoding it uses the + # correct one + write(filename, geometry, field_data, fields, **meta) + + actual_meta, _, _, actual_field_data = read(filename) + assert np.array_equal(fields, actual_meta["fields"]) + assert np.array_equal(field_data, actual_field_data) + assert np.array_equal(fields, read_info(filename)["fields"]) + + # verify that if cpg file is not present, that user-provided encoding must be used + os.unlink(str(filename).replace(".shp", ".cpg")) + + bad_meta, _, _, bad_field_data = read(filename) + assert not np.array_equal(fields, bad_meta["fields"]) + assert not np.array_equal(field_data, bad_field_data) + assert not np.array_equal(fields, read_info(filename)["fields"]) + + actual_meta, _, _, actual_field_data = read(filename, encoding=encoding) + assert np.array_equal(fields, actual_meta["fields"]) + assert np.array_equal(field_data, actual_field_data) + assert np.array_equal(fields, read_info(filename, encoding=encoding)["fields"]) + + def test_write_with_mask(tmp_path): # Point(0, 0), null geometry = np.array( From e738f7b8aec3ca540c05ece7f022cbe087510e73 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Fri, 5 Apr 2024 18:12:29 -0700 Subject: [PATCH 03/21] Use thread local config, add more tests --- CHANGES.md | 6 +-- pyogrio/_io.pyx | 56 ++++++++++++++++-------- pyogrio/_ogr.pxd | 2 + pyogrio/geopandas.py | 3 +- pyogrio/tests/test_geopandas_io.py | 69 ++++++++++++++++++++++++++++++ pyogrio/tests/test_raw_io.py | 66 ++++++++++++++++++++++------ 6 files changed, 169 insertions(+), 33 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 5c944663..6740bd3b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -18,9 +18,9 @@ - Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if a boolean column is detected due to error in GDAL reading boolean values (#335) this has been fixed in GDAL >= 3.8.3. -- Disable GDAL's auto-decoding of ESRI Shapefiles to `UTF-8` in order to enable - user-provided `encoding` option to work properly for `read`, `read_dataframe`, and `open_arrow`, and correctly encode Shapefile field names and text values to the - user-provided `encoding` (#384). +- Properly handle decoding of ESRI Shapefiles with user-provided `encoding` + option for `read`, `read_dataframe`, and `open_arrow`, and correctly encode + Shapefile field names and text values to the user-provided `encoding` (#384). 
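A usage-level sketch of the round-trip this entry describes, mirroring the tests added in this series (the output path and the CP936 sample text are illustrative only, not part of the patch):

```python
import geopandas as gp
from shapely.geometry import Point

from pyogrio import read_dataframe, write_dataframe

df = gp.GeoDataFrame({"中文": ["中文"], "geometry": [Point(0, 0)]}, crs="EPSG:4326")

# encoding is passed through to the ENCODING layer creation option, so GDAL
# writes a .cpg sidecar file naming the code page
write_dataframe(df, "/tmp/test.shp", encoding="CP936")

# with the .cpg sidecar present, GDAL detects the encoding automatically; if
# it is missing, the user-provided encoding is required to decode correctly
actual = read_dataframe("/tmp/test.shp", encoding="CP936")
assert actual.columns[0] == "中文"
```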
### Packaging diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 04c9190e..852adadd 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1153,15 +1153,17 @@ def ogr_read( # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) if prev_shape_encoding_c != NULL: # strings returned from config options may be replaced via # CPLSetConfigOption() below; GDAL instructs us to save a copy # in a new string prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - # set empty string to disable auto-decoding by GDAL to UTF-8 - CPLSetConfigOption("SHAPE_ENCODING", "") + # set encoding used automatically by GDAL + encoding_b = encoding.encode('UTF-8') + encoding_c = encoding_b + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) @@ -1178,8 +1180,13 @@ def ogr_read( # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - # FIXME: it appears we need to set ENCODING sooner - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8 + encoding = "UTF-8" + + else: + encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding) @@ -1274,7 +1281,7 @@ def ogr_read( # reset SHAPE_ENCODING config parameter if temporarily set above if override_shape_encoding: - CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: CPLFree(prev_shape_encoding) @@ -1369,16 +1376,17 @@ def ogr_open_arrow( # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) if prev_shape_encoding_c != NULL: # strings returned from config options may be replaced via # CPLSetConfigOption() below; GDAL instructs us to save a copy # in a new string prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - # set empty string to disable auto-decoding by GDAL to UTF-8 - CPLSetConfigOption("SHAPE_ENCODING", "") - + # set encoding used automatically by GDAL + encoding_b = encoding.encode('UTF-8') + encoding_c = encoding_b + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1394,7 +1402,13 @@ def ogr_open_arrow( # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8 + encoding = "UTF-8" + + else: + encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding, use_arrow=True) @@ -1504,7 +1518,7 @@ def ogr_open_arrow( # reset SHAPE_ENCODING config parameter if temporarily set above if 
override_shape_encoding: - CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: CPLFree(prev_shape_encoding) @@ -1592,15 +1606,17 @@ def ogr_read_info( # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetConfigOption("SHAPE_ENCODING", NULL) + prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) if prev_shape_encoding_c != NULL: # strings returned from config options may be replaced via # CPLSetConfigOption() below; GDAL instructs us to save a copy # in a new string prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - # set empty string to disable auto-decoding by GDAL to UTF-8 - CPLSetConfigOption("SHAPE_ENCODING", "") + # set encoding used automatically by GDAL + encoding_b = encoding.encode('UTF-8') + encoding_c = encoding_b + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) @@ -1612,7 +1628,13 @@ def ogr_read_info( # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8 + encoding = "UTF-8" + + else: + encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding) @@ -1647,7 +1669,7 @@ def ogr_read_info( # reset SHAPE_ENCODING config parameter if temporarily set above if override_shape_encoding: - CPLSetConfigOption("SHAPE_ENCODING", prev_shape_encoding) + CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: CPLFree(prev_shape_encoding) diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index e62241dc..1fd3772e 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -12,6 +12,8 @@ cdef extern from "cpl_conv.h": const char* CPLFindFile(const char *pszClass, const char *filename) const char* CPLGetConfigOption(const char* key, const char* value) void CPLSetConfigOption(const char* key, const char* value) + const char* CPLGetThreadLocalConfigOption(const char* key, const char* value) + void CPLSetThreadLocalConfigOption(const char* key, const char* value) char* CPLStrdup(const char* string) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 1cbcb113..f647126e 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -349,7 +349,8 @@ def write_dataframe( attempts to infer driver from path. encoding : str, optional (default: None) If present, will be used as the encoding for writing string values to - the file. + the file. Use with caution, only certain drivers support encodings + other than UTF-8. geometry_type : string, optional (default: None) By default, the geometry type of the layer will be inferred from the data, after applying the promote_to_multi logic. 
If the data only contains a diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 412b95b6..11c14f0a 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1611,3 +1611,72 @@ def test_arrow_bool_exception(tmpdir): "the Arrow API", ): read_dataframe(filename, use_arrow=True) + + +@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +def test_non_utf8_encoding_io(tmp_path, ext): + """Verify that we write non-UTF data to the data source + + IMPORTANT: this may not be valid for the data source and will likely render + them unusable in other tools, but should successfully roundtrip unless we + disable writing using other encodings. + + NOTE: pyarrow cannot handle non-UTF-8 characters in this way + """ + encoding = "CP936" + + output_path = tmp_path / f"test.{ext}" + + mandarin = "中文" + df = gp.GeoDataFrame( + {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326" + ) + write_dataframe(df, output_path, encoding=encoding) + + # cannot open these files without specifying encoding + with pytest.raises(UnicodeDecodeError): + read_dataframe(output_path) + + # must provide encoding to read these properly + actual = read_dataframe(output_path, encoding=encoding) + assert actual.columns[0] == mandarin + assert actual[mandarin].values[0] == mandarin + + +def test_non_utf8_encoding_io_shapefile(tmp_path, use_arrow): + encoding = "CP936" + + output_path = tmp_path / "test.shp" + + mandarin = "中文" + df = gp.GeoDataFrame( + {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326" + ) + write_dataframe(df, output_path, encoding=encoding) + + # NOTE: GDAL automatically creates a cpg file with the encoding name, which + # means that if we read this without specifying the encoding it uses the + # correct one + actual = read_dataframe(output_path, use_arrow=use_arrow) + assert actual.columns[0] == mandarin + assert actual[mandarin].values[0] == mandarin + + # verify that if cpg file is not present, that user-provided encoding must be used + os.unlink(str(output_path).replace(".shp", ".cpg")) + + # We will assume ISO-8859-1, which is wrong + miscoded = mandarin.encode("CP936").decode("ISO-8859-1") + + if use_arrow: + # pyarrow cannot decode column name with incorrect encoding + with pytest.raises(UnicodeDecodeError): + read_dataframe(output_path, use_arrow=True) + else: + bad = read_dataframe(output_path, use_arrow=False) + assert bad.columns[0] == miscoded + assert bad[miscoded].values[0] == miscoded + + # If encoding is provided, that should yield correct text + actual = read_dataframe(output_path, encoding=encoding, use_arrow=use_arrow) + assert actual.columns[0] == mandarin + assert actual[mandarin].values[0] == mandarin diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 8f711338..6b255462 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1168,7 +1168,46 @@ def test_encoding_io_shapefile(tmp_path, read_encoding, write_encoding): ) -def test_non_utf8_encoding_shapefile(tmp_path): +@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +def test_non_utf8_encoding_io(tmp_path, ext): + """Verify that we write non-UTF data to the data source + + IMPORTANT: this may not be valid for the data source and will likely render + them unusable in other tools, but should successfully roundtrip unless we + disable writing using other encodings. 
+ """ + + encoding = "CP936" + + # Point(0, 0) + geometry = np.array( + [bytes.fromhex("010100000000000000000000000000000000000000")], dtype=object + ) + + mandarin = "中文" + field_data = [np.array([mandarin], dtype=object)] + + fields = [mandarin] + meta = dict(geometry_type="Point", crs="EPSG:4326", encoding=encoding) + + filename = tmp_path / f"test.{ext}" + write(filename, geometry, field_data, fields, **meta) + + # cannot open these files without specifying encoding + with pytest.raises(UnicodeDecodeError): + read(filename) + + with pytest.raises(UnicodeDecodeError): + read_info(filename) + + # must provide encoding to read these properly + actual_meta, _, _, actual_field_data = read(filename, encoding=encoding) + assert actual_meta["fields"][0] == mandarin + assert actual_field_data[0] == mandarin + assert read_info(filename, encoding=encoding)["fields"][0] == mandarin + + +def test_non_utf8_encoding_io_shapefile(tmp_path): encoding = "CP936" # Point(0, 0) @@ -1183,28 +1222,31 @@ def test_non_utf8_encoding_shapefile(tmp_path): meta = dict(geometry_type="Point", crs="EPSG:4326", encoding=encoding) filename = tmp_path / "test.shp" + write(filename, geometry, field_data, fields, **meta) + # NOTE: GDAL automatically creates a cpg file with the encoding name, which # means that if we read this without specifying the encoding it uses the # correct one - write(filename, geometry, field_data, fields, **meta) - actual_meta, _, _, actual_field_data = read(filename) - assert np.array_equal(fields, actual_meta["fields"]) - assert np.array_equal(field_data, actual_field_data) - assert np.array_equal(fields, read_info(filename)["fields"]) + assert actual_meta["fields"][0] == mandarin + assert actual_field_data[0] == mandarin + assert read_info(filename)["fields"][0] == mandarin # verify that if cpg file is not present, that user-provided encoding must be used os.unlink(str(filename).replace(".shp", ".cpg")) + # We will assume ISO-8859-1, which is wrong bad_meta, _, _, bad_field_data = read(filename) - assert not np.array_equal(fields, bad_meta["fields"]) - assert not np.array_equal(field_data, bad_field_data) - assert not np.array_equal(fields, read_info(filename)["fields"]) + miscoded = mandarin.encode("CP936").decode("ISO-8859-1") + assert bad_meta["fields"][0] == miscoded + assert bad_field_data[0] == miscoded + assert read_info(filename)["fields"][0] == miscoded + # If encoding is provided, that should yield correct text actual_meta, _, _, actual_field_data = read(filename, encoding=encoding) - assert np.array_equal(fields, actual_meta["fields"]) - assert np.array_equal(field_data, actual_field_data) - assert np.array_equal(fields, read_info(filename, encoding=encoding)["fields"]) + assert actual_meta["fields"][0] == mandarin + assert actual_field_data[0] == mandarin + assert read_info(filename, encoding=encoding)["fields"][0] == mandarin def test_write_with_mask(tmp_path): From 3362daed71020373f7a7e0b239e9e6627d8f7ad7 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Fri, 5 Apr 2024 20:42:45 -0700 Subject: [PATCH 04/21] Fix detection of encoding for shapefile layers via SQL --- pyogrio/_io.pyx | 48 +++++++++++++++++++++++------- pyogrio/tests/test_geopandas_io.py | 29 ++++++++++++++++++ 2 files changed, 67 insertions(+), 10 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 852adadd..6cf4ef69 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -486,6 +486,9 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): # read without recoding. 
Hence, it is up to you to supply the data in the # appropriate encoding. More info: # https://gdal.org/development/rfc/rfc23_ogr_unicode.html#oftstring-oftstringlist-fields + # NOTE: this always returns False for the layer returned by executing SQL, + # even when it does support UTF-8; the layer underlying the SQL must + # be passed instead return "UTF-8" driver = get_driver(ogr_dataset) @@ -1109,6 +1112,7 @@ def ogr_read( cdef char **fields_c = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL + cdef OGRLayerH base_layer = NULL cdef int feature_count = 0 cdef double xmin, ymin, xmax, ymax cdef const char* prev_shape_encoding_c = NULL @@ -1169,6 +1173,11 @@ dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) + if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8 + encoding = "UTF-8" + if sql is None: if layer is None: layer = get_default_layer(ogr_dataset) @@ -1180,13 +1189,22 @@ # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8 - encoding = "UTF-8" + if encoding: + if get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8; ignore any encoding set by user + encoding = "UTF-8" else: - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + if sql is not None and get_driver(ogr_dataset) == "ESRI Shapefile": + # in order to properly test certain capabilities, we need to have the + # underlying layer referenced by the SQL query + base_layer = GDALDatasetGetLayer(ogr_dataset, 0) + + else: + base_layer = ogr_layer + + encoding = detect_encoding(ogr_dataset, base_layer) fields = get_fields(ogr_layer, encoding) @@ -1319,6 +1337,7 @@ def ogr_open_arrow( cdef const char *where_c = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL + cdef OGRLayerH base_layer = NULL cdef char **fields_c = NULL cdef const char *field_c = NULL cdef char **options = NULL @@ -1402,13 +1421,22 @@ # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale - if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8 - encoding = "UTF-8" + if encoding: + if get_driver(ogr_dataset) == "ESRI Shapefile": + # Because SHAPE_ENCODING is set above, GDAL will automatically decode + # to UTF-8; ignore any encoding set by user + encoding = "UTF-8" else: - encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) + if sql is not None and get_driver(ogr_dataset) == "ESRI Shapefile": + # in order to properly test certain capabilities, we need to have the + # underlying layer referenced by the SQL query + base_layer = GDALDatasetGetLayer(ogr_dataset, 0) + + else: + base_layer = ogr_layer + + encoding = detect_encoding(ogr_dataset, base_layer) fields = get_fields(ogr_layer, encoding, use_arrow=True) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 11c14f0a..897b02ca 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1680,3 +1680,32 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, use_arrow): actual = 
read_dataframe(output_path, encoding=encoding, use_arrow=use_arrow) assert actual.columns[0] == mandarin assert actual[mandarin].values[0] == mandarin + + +def test_non_utf8_encoding_shapefile_sql(tmp_path, use_arrow): + encoding = "CP936" + + output_path = tmp_path / "test.shp" + + mandarin = "中文" + df = gp.GeoDataFrame( + {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326" + ) + write_dataframe(df, output_path, encoding=encoding) + + actual = read_dataframe( + output_path, + sql=f"select * from test where \"{mandarin}\" = '{mandarin}'", + use_arrow=use_arrow, + ) + assert actual.columns[0] == mandarin + assert actual[mandarin].values[0] == mandarin + + actual = read_dataframe( + output_path, + sql=f"select * from test where \"{mandarin}\" = '{mandarin}'", + encoding=encoding, + use_arrow=use_arrow, + ) + assert actual.columns[0] == mandarin + assert actual[mandarin].values[0] == mandarin From b99103819dae62d33e6849e6bc0f5cbf3b0d3ffd Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Sat, 6 Apr 2024 08:19:29 -0700 Subject: [PATCH 05/21] Expand encoding test cases --- pyogrio/tests/conftest.py | 20 +++++++++++++++ pyogrio/tests/test_geopandas_io.py | 32 ++++++++++-------------- pyogrio/tests/test_raw_io.py | 39 ++++++++++++++---------------- 3 files changed, 51 insertions(+), 40 deletions(-) diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index 76327b4f..4efc9b0e 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -133,3 +133,23 @@ def test_datetime(): @pytest.fixture(scope="session") def test_datetime_tz(): return _data_dir / "test_datetime_tz.geojson" + + +@pytest.fixture( + scope="session", + params=[ + # Nordic + ("CP865", "Å"), + # Japanese + ("CP932", "ホ"), + # Chinese + ("CP936", "中文"), + # ANSI + ("CP1252", "ÿ"), + # Greek + ("CP1253", "Φ"), + ], +) +def encoded_text(request): + """Return tuple with encoding name and very short sample text in that encoding""" + return request.param diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 897b02ca..da7d9f1e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1614,7 +1614,7 @@ def test_arrow_bool_exception(tmpdir): @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) -def test_non_utf8_encoding_io(tmp_path, ext): +def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source IMPORTANT: this may not be valid for the data source and will likely render @@ -1623,14 +1623,11 @@ def test_non_utf8_encoding_io(tmp_path, ext): NOTE: pyarrow cannot handle non-UTF-8 characters in this way """ - encoding = "CP936" + encoding, text = encoded_text output_path = tmp_path / f"test.{ext}" - mandarin = "中文" - df = gp.GeoDataFrame( - {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326" - ) + df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326") write_dataframe(df, output_path, encoding=encoding) # cannot open these files without specifying encoding @@ -1639,33 +1636,30 @@ def test_non_utf8_encoding_io(tmp_path, ext): # must provide encoding to read these properly actual = read_dataframe(output_path, encoding=encoding) - assert actual.columns[0] == mandarin - assert actual[mandarin].values[0] == mandarin + assert actual.columns[0] == text + assert actual[text].values[0] == text -def test_non_utf8_encoding_io_shapefile(tmp_path, use_arrow): - encoding = "CP936" +def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, 
use_arrow): + encoding, text = encoded_text output_path = tmp_path / "test.shp" - mandarin = "中文" - df = gp.GeoDataFrame( - {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326" - ) + df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326") write_dataframe(df, output_path, encoding=encoding) # NOTE: GDAL automatically creates a cpg file with the encoding name, which # means that if we read this without specifying the encoding it uses the # correct one actual = read_dataframe(output_path, use_arrow=use_arrow) - assert actual.columns[0] == mandarin - assert actual[mandarin].values[0] == mandarin + assert actual.columns[0] == text + assert actual[text].values[0] == text # verify that if cpg file is not present, that user-provided encoding must be used os.unlink(str(output_path).replace(".shp", ".cpg")) # We will assume ISO-8859-1, which is wrong - miscoded = mandarin.encode("CP936").decode("ISO-8859-1") + miscoded = text.encode(encoding).decode("ISO-8859-1") if use_arrow: # pyarrow cannot decode column name with incorrect encoding @@ -1678,8 +1672,8 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, use_arrow): # If encoding is provided, that should yield correct text actual = read_dataframe(output_path, encoding=encoding, use_arrow=use_arrow) - assert actual.columns[0] == mandarin - assert actual[mandarin].values[0] == mandarin + assert actual.columns[0] == text + assert actual[text].values[0] == text def test_non_utf8_encoding_shapefile_sql(tmp_path, use_arrow): diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 6b255462..0387c368 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1169,25 +1169,23 @@ def test_encoding_io_shapefile(tmp_path, read_encoding, write_encoding): @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) -def test_non_utf8_encoding_io(tmp_path, ext): +def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source IMPORTANT: this may not be valid for the data source and will likely render them unusable in other tools, but should successfully roundtrip unless we disable writing using other encodings. 
""" - - encoding = "CP936" + encoding, text = encoded_text # Point(0, 0) geometry = np.array( [bytes.fromhex("010100000000000000000000000000000000000000")], dtype=object ) - mandarin = "中文" - field_data = [np.array([mandarin], dtype=object)] + field_data = [np.array([text], dtype=object)] - fields = [mandarin] + fields = [text] meta = dict(geometry_type="Point", crs="EPSG:4326", encoding=encoding) filename = tmp_path / f"test.{ext}" @@ -1202,23 +1200,22 @@ def test_non_utf8_encoding_io(tmp_path, ext): # must provide encoding to read these properly actual_meta, _, _, actual_field_data = read(filename, encoding=encoding) - assert actual_meta["fields"][0] == mandarin - assert actual_field_data[0] == mandarin - assert read_info(filename, encoding=encoding)["fields"][0] == mandarin + assert actual_meta["fields"][0] == text + assert actual_field_data[0] == text + assert read_info(filename, encoding=encoding)["fields"][0] == text -def test_non_utf8_encoding_io_shapefile(tmp_path): - encoding = "CP936" +def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text): + encoding, text = encoded_text # Point(0, 0) geometry = np.array( [bytes.fromhex("010100000000000000000000000000000000000000")], dtype=object ) - mandarin = "中文" - field_data = [np.array([mandarin], dtype=object)] + field_data = [np.array([text], dtype=object)] - fields = [mandarin] + fields = [text] meta = dict(geometry_type="Point", crs="EPSG:4326", encoding=encoding) filename = tmp_path / "test.shp" @@ -1228,25 +1225,25 @@ def test_non_utf8_encoding_io_shapefile(tmp_path): # means that if we read this without specifying the encoding it uses the # correct one actual_meta, _, _, actual_field_data = read(filename) - assert actual_meta["fields"][0] == mandarin - assert actual_field_data[0] == mandarin - assert read_info(filename)["fields"][0] == mandarin + assert actual_meta["fields"][0] == text + assert actual_field_data[0] == text + assert read_info(filename)["fields"][0] == text # verify that if cpg file is not present, that user-provided encoding must be used os.unlink(str(filename).replace(".shp", ".cpg")) # We will assume ISO-8859-1, which is wrong bad_meta, _, _, bad_field_data = read(filename) - miscoded = mandarin.encode("CP936").decode("ISO-8859-1") + miscoded = text.encode(encoding).decode("ISO-8859-1") assert bad_meta["fields"][0] == miscoded assert bad_field_data[0] == miscoded assert read_info(filename)["fields"][0] == miscoded # If encoding is provided, that should yield correct text actual_meta, _, _, actual_field_data = read(filename, encoding=encoding) - assert actual_meta["fields"][0] == mandarin - assert actual_field_data[0] == mandarin - assert read_info(filename, encoding=encoding)["fields"][0] == mandarin + assert actual_meta["fields"][0] == text + assert actual_field_data[0] == text + assert read_info(filename, encoding=encoding)["fields"][0] == text def test_write_with_mask(tmp_path): From ecb56f678468b18c8382c5ab004b83bb1abd0aae Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Sat, 6 Apr 2024 13:35:28 -0700 Subject: [PATCH 06/21] Use ANSI code pages for alternative encodings (not DOS) and slightly improve docs --- docs/source/known_issues.md | 15 +++++++++++++-- pyogrio/_io.pyx | 4 ++-- pyogrio/tests/conftest.py | 14 ++++++++++---- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md index 29d390ee..a1a90adf 100644 --- a/docs/source/known_issues.md +++ b/docs/source/known_issues.md @@ -41,10 +41,21 @@ geometries from the data layer. 
## Character encoding -Pyogrio supports reading / writing data layers with a defined encoding. However, +Pyogrio supports reading / writing data layers with a defined encoding. When +the `encoding` option is not specified, GDAL will, where possible, attempt to +automatically decode from the native encoding to `UTF-8`, and pyogrio will report +that the encoding is `UTF-8` in that case instead of the native encoding. For +[ESRI Shapefiles](https://gdal.org/drivers/vector/shapefile.html#encoding), +GDAL will use the associated `.cpg` file or a code page specified in the `.dbf` +file to infer the native encoding, but may incorrectly assume the native encoding +is `ISO-8859-1` leading to miscoding errors. Most other drivers are assumed to +be in `UTF-8`, but it is possible (in theory) to specify the `encoding` parameter +manually to force conversions to use the specified encoding value. + +However, DataFrames do not currently allow arbitrary metadata, which means that we are currently unable to store encoding information for a data source. Text fields -are read into Python UTF-8 strings. +are read into Python `UTF-8` strings. ## No validation of geometry or field types diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 6cf4ef69..84cf4bb0 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1549,7 +1549,7 @@ def ogr_open_arrow( CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: - CPLFree(prev_shape_encoding) + CPLFree(prev_shape_encoding) def ogr_read_bounds( str path, object layer=None, @@ -1700,7 +1700,7 @@ def ogr_read_info( CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: - CPLFree(prev_shape_encoding) + CPLFree(prev_shape_encoding) return meta diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index 4efc9b0e..766d1bed 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -138,18 +138,24 @@ def test_datetime_tz(): @pytest.fixture( scope="session", params=[ - # Nordic - ("CP865", "Å"), # Japanese ("CP932", "ホ"), # Chinese ("CP936", "中文"), - # ANSI + # Central European + ("CP1250", "Đ"), + # Latin 1 / Western European ("CP1252", "ÿ"), # Greek ("CP1253", "Φ"), + # Arabic + ("CP1256", "ش"), ], ) def encoded_text(request): - """Return tuple with encoding name and very short sample text in that encoding""" + """Return tuple with encoding name and very short sample text in that encoding + NOTE: it was determined through testing that code pages for MS-DOS do not + consistently work across all Python installations (in particular, fail with conda), + but ANSI code pages appear to work properly. + """ return request.param From 994be50f0bf1a5c72d10e3f65002d7394263fcc5 Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Sat, 6 Apr 2024 13:48:56 -0700 Subject: [PATCH 07/21] Improve docs a little more --- docs/source/introduction.md | 18 +++++++++++------- docs/source/known_issues.md | 7 ++----- pyogrio/core.py | 4 ++++ pyogrio/geopandas.py | 4 ++-- pyogrio/raw.py | 4 ++-- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/docs/source/introduction.md b/docs/source/introduction.md index e7ac8cf1..c62357e1 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -89,6 +89,10 @@ first layer. } ``` +NOTE: pyogrio will report `UTF-8` if either the native encoding is likely to be +`UTF-8` or GDAL can automatically convert from the detected native encoding to +`UTF-8`. 
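A short sketch of what this note means in practice (the shapefile path is a placeholder; `read_info` is pyogrio's existing metadata function):

```python
from pyogrio import read_info

# GDAL detects the native encoding (e.g. from a .cpg sidecar or the .dbf code
# page) and recodes text to UTF-8, so pyogrio reports UTF-8 here rather than
# the native encoding
info = read_info("ne_110m_admin_0_countries.shp")  # placeholder path
assert info["encoding"] == "UTF-8"
```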
+ To read from a layer using name or index (the following are equivalent): ```python @@ -468,9 +472,9 @@ You can also read from a URL with this syntax: GDAL only supports datetimes at a millisecond resolution. Reading data will thus give at most millisecond resolution (`datetime64[ms]` data type). With pandas 2.0 -`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]` -correspondingly. For previous versions of pandas, `datetime64[ns]` is used as -ms precision was not supported. When writing, only precision up to +`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]` +correspondingly. For previous versions of pandas, `datetime64[ns]` is used as +ms precision was not supported. When writing, only precision up to ms is retained. Not all file formats have dedicated support to store datetime data, like ESRI @@ -478,11 +482,11 @@ Shapefile. For such formats, or if you require precision > ms, a workaround is t convert the datetimes to string. Timezone information is preserved where possible, however GDAL only represents -time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or -`zoneinfo`). This means that dataframes with columns containing multiple offsets +time zones as UTC offsets, whilst pandas uses IANA time zones (via `pytz` or +`zoneinfo`). This means that dataframes with columns containing multiple offsets (e.g. when switching from standard time to summer time) will be written correctly, -but when read via `pyogrio.read_dataframe()` will be returned as a UTC datetime -column, as there is no way to reconstruct the original timezone from the individual +but when read via `pyogrio.read_dataframe()` will be returned as a UTC datetime +column, as there is no way to reconstruct the original timezone from the individual offsets present. ## Dataset and layer creation options diff --git a/docs/source/known_issues.md b/docs/source/known_issues.md index a1a90adf..1a26efac 100644 --- a/docs/source/known_issues.md +++ b/docs/source/known_issues.md @@ -48,14 +48,11 @@ that the encoding is `UTF-8` in that case instead of the native encoding. For [ESRI Shapefiles](https://gdal.org/drivers/vector/shapefile.html#encoding), GDAL will use the associated `.cpg` file or a code page specified in the `.dbf` file to infer the native encoding, but may incorrectly assume the native encoding -is `ISO-8859-1` leading to miscoding errors. Most other drivers are assumed to +is `ISO-8859-1`, leading to miscoding errors. Most other drivers are assumed to be in `UTF-8`, but it is possible (in theory) to specify the `encoding` parameter manually to force conversions to use the specified encoding value. -However, -DataFrames do not currently allow arbitrary metadata, which means that we are -currently unable to store encoding information for a data source. Text fields -are read into Python `UTF-8` strings. +Field names and values are read into Python `UTF-8` strings. ## No validation of geometry or field types diff --git a/pyogrio/core.py b/pyogrio/core.py index 2d43397d..a354b10d 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -217,6 +217,10 @@ def read_info( driver or if the data source is nonspatial. You can force it to be calculated using the ``force_total_bounds`` parameter. + ``encoding`` will be ``UTF-8`` if either the native encoding is likely to be + ``UTF-8`` or GDAL can automatically convert from the detected native encoding + to ``UTF-8``. 
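Because the shapefile conversion is implemented by temporarily overriding GDAL's `SHAPE_ENCODING` config option on the current thread, a pre-existing global value survives a read that specifies `encoding`. A sketch of that guarantee, mirroring the test added later in this series (the shapefile path is a placeholder):

```python
from pyogrio import get_gdal_config_option, read_info, set_gdal_config_options

set_gdal_config_options({"SHAPE_ENCODING": "CP1254"})
try:
    # encoding temporarily overrides SHAPE_ENCODING (thread-local) during read
    info = read_info("test.shp", encoding="CP936")  # placeholder shapefile
    # the globally set option is restored once the read completes
    assert get_gdal_config_option("SHAPE_ENCODING") == "CP1254"
finally:
    set_gdal_config_options({"SHAPE_ENCODING": None})  # clear between uses
```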
+ Parameters ---------- path : str or pathlib.Path diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index f647126e..afbdccef 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -106,8 +106,8 @@ def read_dataframe( of the layer in the data source. Defaults to first layer in data source. encoding : str, optional (default: None) If present, will be used as the encoding for reading string values from - the data source, unless encoding can be inferred directly from the data - source. + the data source. By default will automatically try to detect the native + encoding and decode to ``UTF-8``. columns : list-like, optional (default: all columns) List of column names to import from the data source. Column names must exactly match the names in the data source, and will be returned in diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 6499867a..d82e4611 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -67,8 +67,8 @@ def read( of the layer in the data source. Defaults to first layer in data source. encoding : str, optional (default: None) If present, will be used as the encoding for reading string values from - the data source, unless encoding can be inferred directly from the data - source. + the data source. By default will automatically try to detect the native + encoding and decode to ``UTF-8``. columns : list-like, optional (default: all columns) List of column names to import from the data source. Column names must exactly match the names in the data source, and will be returned in From 901320fb3d699438782406f36188b79e9c2e94ea Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 8 Apr 2024 06:48:04 -0700 Subject: [PATCH 08/21] consolidate operations per PR feedback --- pyogrio/_io.pyx | 138 +++++++++++++++++++++++-------------------------- 1 file changed, 66 insertions(+), 72 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 84cf4bb0..0db563d5 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -134,6 +134,34 @@ cdef char** dict_to_options(object values): return options +cdef const char* override_threadlocal_config_option(const char* key, const char* value): + """Set a thread-local config option (key=value) via CPLSetThreadLocalConfigOption + + Parameters + ---------- + key : const char* + Python string; must be encoded to UTF-8 before calling this + value : const char* + Python string; must be encoded to UTF-8 before calling this + + Returns + ------- + const char* + value previously set for key, so that it can be later restored. Caller + is responsible for freeing this via CPLFree() if not NULL. + """ + cdef const char *prev_value = CPLGetThreadLocalConfigOption(key, NULL) + if prev_value != NULL: + # strings returned from config options may be replaced via + # CPLSetThreadLocalConfigOption() below; GDAL instructs us to save a copy + # in a new string + prev_value = CPLStrdup(prev_value) + + CPLSetThreadLocalConfigOption(key, value) + + return prev_value + + cdef void* ogr_open(const char* path_c, int mode, char** options) except NULL: cdef void* ogr_dataset = NULL @@ -486,16 +514,27 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): # read without recoding. Hence, it is up to you to supply the data in the # appropriate encoding. 
More info: # https://gdal.org/development/rfc/rfc23_ogr_unicode.html#oftstring-oftstringlist-fields - # NOTE: this always returns False for the layer returned by executing SQL, - # even when it does support UTF-8; the layer underlying the SQL must - # be passed instead + # NOTE: for shapefiles, this always returns False for the layer returned + # when executing SQL, even when it supports UTF-8 (patched below); + # this may be fixed by https://github.com/OSGeo/gdal/pull/9649 (GDAL >=3.9.0?) return "UTF-8" driver = get_driver(ogr_dataset) if driver == "ESRI Shapefile": - # Typically, OGR_L_TestCapability returns True for OLCStringsAsUTF8 for ESRI - # Shapefile so this won't be reached. However, for old versions of GDAL and/or - # if libraries are missing, this fallback could be needed. + # OGR_L_TestCapability returns True for OLCStringsAsUTF8 (above) for + # shapefiles when a .cpg file is present with a valid encoding, or GDAL + # auto-detects the encoding from the code page of the .dbf file, or + # SHAPE_ENCODING config option is set, or ENCODING layer creation option + # is specified (shapefiles only). Otherwise, we can only assume that + # shapefiles are in their default encoding of ISO-8859-1 (which may be + # incorrect and must be overridden by user-provided encoding) + + # Always use the first layer to test capabilities until detection for + # SQL results from shapefiles is fixed (above) + # This block should only be used for unfixed versions of GDAL (<3.9.0?) + if OGR_L_TestCapability(GDALDatasetGetLayer(ogr_dataset, 0), OLCStringsAsUTF8): + return "UTF-8" + return "ISO-8859-1" if driver == "OSM": @@ -1112,11 +1151,9 @@ def ogr_read( cdef char **fields_c = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL - cdef OGRLayerH base_layer = NULL cdef int feature_count = 0 cdef double xmin, ymin, xmax, ymax - cdef const char* prev_shape_encoding_c = NULL - cdef char* prev_shape_encoding = NULL + cdef const char *prev_shape_encoding = NULL cdef bint override_shape_encoding = False path_b = path.encode('utf-8') @@ -1157,18 +1194,9 @@ # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) - if prev_shape_encoding_c != NULL: - # strings returned from config options may be replaced via - # CPLSetConfigOption() below; GDAL instructs us to save a copy - # in a new string - prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - - # set encoding used automatically by GDAL - encoding_b = encoding.encode('UTF-8') + encoding_b = encoding.encode("UTF-8") encoding_c = encoding_b - CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) - + prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1191,20 +1219,12 @@ # or from the system locale if encoding: if get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8; ignore any encoding set by user + # Because SHAPE_ENCODING is set above, GDAL will automatically + # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" else: - if sql is not None and get_driver(ogr_dataset) == "ESRI Shapefile": - # in order to properly test certain capabilities, we need to 
have the - # underlying layer referenced by the SQL query - base_layer = GDALDatasetGetLayer(ogr_dataset, 0) - - else: - base_layer = ogr_layer - - encoding = detect_encoding(ogr_dataset, base_layer) + encoding = detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding) @@ -1302,7 +1322,7 @@ def ogr_read( CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) if prev_shape_encoding != NULL: - CPLFree(prev_shape_encoding) + CPLFree(prev_shape_encoding) return ( meta, @@ -1337,12 +1357,10 @@ def ogr_open_arrow( cdef const char *where_c = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL - cdef OGRLayerH base_layer = NULL cdef char **fields_c = NULL cdef const char *field_c = NULL cdef char **options = NULL - cdef const char* prev_shape_encoding_c = NULL - cdef char* prev_shape_encoding = NULL + cdef const char *prev_shape_encoding = NULL cdef bint override_shape_encoding = False cdef ArrowArrayStream stream cdef ArrowSchema schema @@ -1395,17 +1413,9 @@ def ogr_open_arrow( # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) - if prev_shape_encoding_c != NULL: - # strings returned from config options may be replaced via - # CPLSetConfigOption() below; GDAL instructs us to save a copy - # in a new string - prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - - # set encoding used automatically by GDAL - encoding_b = encoding.encode('UTF-8') + encoding_b = encoding.encode("UTF-8") encoding_c = encoding_b - CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) + prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1423,20 +1433,12 @@ def ogr_open_arrow( # or from the system locale if encoding: if get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8; ignore any encoding set by user + # Because SHAPE_ENCODING is set above, GDAL will automatically + # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" else: - if sql is not None and get_driver(ogr_dataset) == "ESRI Shapefile": - # in order to properly test certain capabilities, we need to have the - # underlying layer referenced by the SQL query - base_layer = GDALDatasetGetLayer(ogr_dataset, 0) - - else: - base_layer = ogr_layer - - encoding = detect_encoding(ogr_dataset, base_layer) + encoding = detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding, use_arrow=True) @@ -1619,7 +1621,7 @@ def ogr_read_info( cdef char **dataset_options = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL - cdef const char* prev_shape_encoding = NULL + cdef const char *prev_shape_encoding = NULL cdef bint override_shape_encoding = False path_b = path.encode('utf-8') @@ -1634,18 +1636,9 @@ def ogr_read_info( # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - prev_shape_encoding_c = CPLGetThreadLocalConfigOption("SHAPE_ENCODING", NULL) - if prev_shape_encoding_c != NULL: - # strings returned from config options may be replaced via - # CPLSetConfigOption() below; 
GDAL instructs us to save a copy - # in a new string - prev_shape_encoding = CPLStrdup(prev_shape_encoding_c) - - # set encoding used automatically by GDAL - encoding_b = encoding.encode('UTF-8') + encoding_b = encoding.encode("UTF-8") encoding_c = encoding_b - CPLSetThreadLocalConfigOption("SHAPE_ENCODING", encoding_c) - + prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1657,8 +1650,8 @@ def ogr_read_info( # Encoding is derived from the user, from the dataset capabilities / type, # or from the system locale if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8 + # Because SHAPE_ENCODING is set above, GDAL will automatically + # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" else: @@ -2127,8 +2120,9 @@ def ogr_write( if driver == 'ESRI Shapefile': - # force encoding for remaining operations to be in UTF-8 because - # GDAL will automatically convert those to the target encoding + # force encoding for remaining operations to be in UTF-8 (even if user + # provides an encoding) because GDAL will automatically convert those to + # the target encoding because ENCODING is set as a layer creation option encoding = "UTF-8" else: From 4313fcaab1226d20456428ebff84d08285a1b3fa Mon Sep 17 00:00:00 2001 From: "Brendan C. Ward" Date: Mon, 8 Apr 2024 08:35:37 -0700 Subject: [PATCH 09/21] verify SHAPE_ENCODING global option is retained --- pyogrio/tests/test_raw_io.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 0387c368..58c73153 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -12,6 +12,7 @@ list_drivers, read_info, set_gdal_config_options, + get_gdal_config_option, __gdal_version__, ) from pyogrio._compat import HAS_SHAPELY @@ -1245,6 +1246,17 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text): assert actual_field_data[0] == text assert read_info(filename, encoding=encoding)["fields"][0] == text + # verify that setting encoding does not corrupt SHAPE_ENCODING option if set + # globally (it is ignored during read when encoding is specified by user) + try: + set_gdal_config_options({"SHAPE_ENCODING": "CP1254"}) + _ = read(filename, encoding=encoding) + assert get_gdal_config_option("SHAPE_ENCODING") == "CP1254" + + finally: + # reset to clear between tests + set_gdal_config_options({"SHAPE_ENCODING": None}) + def test_write_with_mask(tmp_path): # Point(0, 0), null From ef602d51e05f3de8170779f61d0d62dfda18af8a Mon Sep 17 00:00:00 2001 From: "Brendan C. 
Ward" Date: Mon, 8 Apr 2024 14:42:12 -0700 Subject: [PATCH 10/21] Cleanup duplicate override to UTF-8 --- pyogrio/_io.pyx | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 9beda426..251b0038 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1201,11 +1201,6 @@ def ogr_read( dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) - if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically decode - # to UTF-8 - encoding = "UTF-8" - if sql is None: if layer is None: layer = get_default_layer(ogr_dataset) From 2bdf4dc2e3261efb6df31bb82fe90cbed9365799 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 17 Apr 2024 14:41:31 -0700 Subject: [PATCH 11/21] prevent combining encoding parameter and ENCODING open / creation option --- CHANGES.md | 3 +- pyogrio/_io.pyx | 30 +++++++++++---- pyogrio/tests/test_geopandas_io.py | 59 ++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 8 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 4e311c3c..d84a87ce 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -33,7 +33,8 @@ the data source not using the Arrow API (#391). - Properly handle decoding of ESRI Shapefiles with user-provided `encoding` option for `read`, `read_dataframe`, and `open_arrow`, and correctly encode - Shapefile field names and text values to the user-provided `encoding` (#384). + Shapefile field names and text values to the user-provided `encoding` for + `write` and `write_dataframe` (#384). ### Packaging diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 5b713b73..5de43e59 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1216,6 +1216,10 @@ def ogr_read( # or from the system locale if encoding: if get_driver(ogr_dataset) == "ESRI Shapefile": + # NOTE: SHAPE_ENCODING is a configuration option whereas ENCODING is the dataset open option + if "ENCODING" in dataset_kwargs: + raise ValueError('cannot provide both encoding parameter and "ENCODING" option; use encoding parameter to specify correct encoding for data source') + # Because SHAPE_ENCODING is set above, GDAL will automatically # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" @@ -1467,10 +1471,17 @@ def ogr_open_arrow( # or from the system locale if encoding: if get_driver(ogr_dataset) == "ESRI Shapefile": + # NOTE: SHAPE_ENCODING is a configuration option whereas ENCODING is the dataset open option + if "ENCODING" in dataset_kwargs: + raise ValueError('cannot provide both encoding parameter and "ENCODING" option; use encoding parameter to specify correct encoding for data source') + # Because SHAPE_ENCODING is set above, GDAL will automatically # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" + elif encoding.replace('-','').upper() != 'UTF8': + raise ValueError("non-UTF-8 encoding is not supported for Arrow; use the non-Arrow interface instead") + else: encoding = detect_encoding(ogr_dataset, ogr_layer) @@ -2078,23 +2089,28 @@ cdef create_ogr_dataset_layer( dataset_options = NULL raise exc - # Setup layer creation options + # Setup other layer creation options + for k, v in layer_kwargs.items(): + k = k.encode('UTF-8') + v = v.encode('UTF-8') + layer_options = CSLAddNameValue(layer_options, k, v) if driver == 'ESRI Shapefile': # ENCODING option must be set for shapefiles to properly write *.cpg # file containing the encoding; this is not a supported option for - # other drivers + # other 
drivers. This is done after setting general options above + # to override ENCODING if passed by the user as a layer option. if encoding is None: encoding = "UTF-8" + + else: + if "ENCODING" in layer_kwargs: + raise ValueError('cannot provide both encoding parameter and "ENCODING" layer creation option; use the encoding parameter') + encoding_b = encoding.upper().encode('UTF-8') encoding_c = encoding_b layer_options = CSLSetNameValue(layer_options, "ENCODING", encoding_c) - # Setup other layer creation options - for k, v in layer_kwargs.items(): - k = k.encode('UTF-8') - v = v.encode('UTF-8') - layer_options = CSLAddNameValue(layer_options, k, v) ### Get geometry type # TODO: this is brittle for 3D / ZM / M types diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index e925a9c0..2cf39331 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1711,6 +1711,33 @@ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): assert actual[text].values[0] == text +@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text): + """Verify that we write non-UTF data to the data source + + IMPORTANT: this may not be valid for the data source and will likely render + them unusable in other tools, but should successfully roundtrip unless we + disable writing using other encodings. + + NOTE: pyarrow cannot handle non-UTF-8 characters in this way + """ + + encoding, text = encoded_text + output_path = tmp_path / f"test.{ext}" + + df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326") + write_dataframe(df, output_path, encoding=encoding) + + # cannot open these files without specifying encoding + with pytest.raises(UnicodeDecodeError): + read_dataframe(output_path) + + with pytest.raises( + ValueError, match="non-UTF-8 encoding is not supported for Arrow" + ): + read_dataframe(output_path, encoding=encoding, use_arrow=True) + + def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow): encoding, text = encoded_text @@ -1746,6 +1773,38 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow): assert actual.columns[0] == text assert actual[text].values[0] == text + # if ENCODING open option, that should yield correct text + actual = read_dataframe(output_path, use_arrow=use_arrow, ENCODING=encoding) + assert actual.columns[0] == text + assert actual[text].values[0] == text + + +def test_encoding_read_option_collision_shapefile(naturalearth_lowres, use_arrow): + """Providing both encoding parameter and ENCODING open option (even if blank) is not allowed""" + + with pytest.raises( + ValueError, match='cannot provide both encoding parameter and "ENCODING" option' + ): + read_dataframe( + naturalearth_lowres, encoding="CP936", ENCODING="", use_arrow=use_arrow + ) + + +def test_encoding_write_layer_option_collision_shapefile(tmp_path, encoded_text): + """Providing both encoding parameter and ENCODING layer creation option (even if blank) is not allowed""" + encoding, text = encoded_text + + output_path = tmp_path / "test.shp" + df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326") + + with pytest.raises( + ValueError, + match='cannot provide both encoding parameter and "ENCODING" layer creation option', + ): + write_dataframe( + df, output_path, encoding=encoding, layer_options={"ENCODING": ""} + ) + def test_non_utf8_encoding_shapefile_sql(tmp_path, use_arrow): encoding = 
"CP936" From b8cc9024bce92df356a1ad502d77ed7a32f4b8cc Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 17 Apr 2024 14:48:58 -0700 Subject: [PATCH 12/21] Fix missing pyarrow constraint for test --- pyogrio/tests/test_geopandas_io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 2cf39331..f1610583 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1711,6 +1711,7 @@ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): assert actual[text].values[0] == text +@requires_pyarrow_api @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source From e7db258182cf7193ab9a08b2bacf39a7327587c8 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 17 Apr 2024 15:33:51 -0700 Subject: [PATCH 13/21] Try to verify that platform default encoding is used for CSV by default --- pyogrio/tests/test_geopandas_io.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f1610583..6f0030b7 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1,6 +1,9 @@ import contextlib from datetime import datetime +import locale import os +import sys + import numpy as np import pytest @@ -81,6 +84,19 @@ def test_read_csv_encoding(tmp_path, encoding): assert df.city.tolist() == ["Zürich"] assert df.näme.tolist() == ["Wilhelm Röntgen"] + # verify that read defaults to platform encoding + if ( + sys.platform == "win32" + and encoding == "cp1252" + and locale.getpreferredencoding().lower() == encoding + ): + df = read_dataframe(csv_path) + + assert len(df) == 1 + assert df.columns.tolist() == ["näme", "city"] + assert df.city.tolist() == ["Zürich"] + assert df.näme.tolist() == ["Wilhelm Röntgen"] + def test_read_dataframe(naturalearth_lowres_all_ext): df = read_dataframe(naturalearth_lowres_all_ext) From eaecb18978dffaba341a7a1ae77595979d637eb4 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 17 Apr 2024 15:40:10 -0700 Subject: [PATCH 14/21] split CSV platform encoding test out to verify it executes on Windows --- pyogrio/tests/test_geopandas_io.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 6f0030b7..f570d563 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -2,7 +2,6 @@ from datetime import datetime import locale import os -import sys import numpy as np import pytest @@ -84,18 +83,24 @@ def test_read_csv_encoding(tmp_path, encoding): assert df.city.tolist() == ["Zürich"] assert df.näme.tolist() == ["Wilhelm Röntgen"] - # verify that read defaults to platform encoding - if ( - sys.platform == "win32" - and encoding == "cp1252" - and locale.getpreferredencoding().lower() == encoding - ): - df = read_dataframe(csv_path) - assert len(df) == 1 - assert df.columns.tolist() == ["näme", "city"] - assert df.city.tolist() == ["Zürich"] - assert df.näme.tolist() == ["Wilhelm Röntgen"] +@pytest.mark.skipif( + locale.getpreferredencoding().upper() != "UTF-8", + reason="test requires non-UTF-8 default platform", +) +def test_read_csv_platform_encoding(tmp_path): + """verify that read defaults to platform encoding; only works on Windows (CP1252)""" + csv_path = tmp_path / 
"test.csv" + with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv: + csv.write("näme,city\n") + csv.write("Wilhelm Röntgen,Zürich\n") + + df = read_dataframe(csv_path) + + assert len(df) == 1 + assert df.columns.tolist() == ["näme", "city"] + assert df.city.tolist() == ["Zürich"] + assert df.näme.tolist() == ["Wilhelm Röntgen"] def test_read_dataframe(naturalearth_lowres_all_ext): From a6235017196b075c049dfb4b0a11fc16a080401e Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Wed, 17 Apr 2024 15:50:33 -0700 Subject: [PATCH 15/21] Fix bug in skip of CSV encoding test --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index f570d563..fcf8e5db 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -85,7 +85,7 @@ def test_read_csv_encoding(tmp_path, encoding): @pytest.mark.skipif( - locale.getpreferredencoding().upper() != "UTF-8", + locale.getpreferredencoding().upper() == "UTF-8", reason="test requires non-UTF-8 default platform", ) def test_read_csv_platform_encoding(tmp_path): From d2b43f34a1b2a5e5d811587496b8306698d9385a Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 16:35:38 -0700 Subject: [PATCH 16/21] cleanup, add arrow I/O tests --- pyogrio/_io.pyx | 74 +++++++++------------- pyogrio/tests/test_arrow.py | 98 ++++++++++++++++++++++++++++++ pyogrio/tests/test_geopandas_io.py | 9 --- pyogrio/tests/test_raw_io.py | 2 +- 4 files changed, 128 insertions(+), 55 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 20f7a33b..78489657 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -138,15 +138,13 @@ cdef char** dict_to_options(object values): return options -cdef const char* override_threadlocal_config_option(const char* key, const char* value): +cdef const char* override_threadlocal_config_option(str key, str value): """Set the CPLSetThreadLocalConfigOption for key=value Parameters ---------- - key : const char* - python string must decoded to UTF-8 before calling this - value : const char* - python string must decoded to UTF-8 before calling this + key : str + value : str Returns ------- @@ -154,14 +152,22 @@ cdef const char* override_threadlocal_config_option(const char* key, const char* value previously set for key, so that it can be later restored. Caller is responsible for freeing this via CPLFree() if not NULL. 
""" - cdef const char *prev_value = CPLGetThreadLocalConfigOption(key, NULL) + + key_b = key.encode("UTF-8") + cdef const char* key_c = key_b + + value_b = value.encode("UTF-8") + cdef const char* value_c = value_b + + + cdef const char *prev_value = CPLGetThreadLocalConfigOption(key_c, NULL) if prev_value != NULL: # strings returned from config options may be replaced via # CPLSetConfigOption() below; GDAL instructs us to save a copy # in a new string prev_value = CPLStrdup(prev_value) - CPLSetThreadLocalConfigOption(key, value) + CPLSetThreadLocalConfigOption(key_c, value_c) return prev_value @@ -1191,16 +1197,13 @@ def ogr_read( try: if encoding: - override_shape_encoding = True - # for shapefiles, SHAPE_ENCODING must be set before opening the file # to prevent automatic decoding to UTF-8 by GDAL, so we save previous # SHAPE_ENCODING so that it can be restored later # (we do this for all data sources where encoding is set because # we don't know the driver until after it is opened, which is too late) - encoding_b = encoding.encode("UTF-8") - encoding_c = encoding_b - prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) + override_shape_encoding = True + prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1447,15 +1450,7 @@ def ogr_open_arrow( try: if encoding: override_shape_encoding = True - - # for shapefiles, SHAPE_ENCODING must be set before opening the file - # to prevent automatic decoding to UTF-8 by GDAL, so we save previous - # SHAPE_ENCODING so that it can be restored later - # (we do this for all data sources where encoding is set because - # we don't know the driver until after it is opened, which is too late) - encoding_b = encoding.encode("UTF-8") - encoding_c = encoding_b - prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) + prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1473,12 +1468,9 @@ def ogr_open_arrow( # or from the system locale if encoding: if get_driver(ogr_dataset) == "ESRI Shapefile": - # NOTE: SHAPE_ENCODING is a configuration option whereas ENCODING is the dataset open option if "ENCODING" in dataset_kwargs: raise ValueError('cannot provide both encoding parameter and "ENCODING" option; use encoding parameter to specify correct encoding for data source') - # Because SHAPE_ENCODING is set above, GDAL will automatically - # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" elif encoding.replace('-','').upper() != 'UTF8': @@ -1652,6 +1644,7 @@ def ogr_open_arrow( if prev_shape_encoding != NULL: CPLFree(prev_shape_encoding) + def ogr_read_bounds( str path, object layer=None, @@ -1730,15 +1723,7 @@ def ogr_read_info( try: if encoding: override_shape_encoding = True - - # for shapefiles, SHAPE_ENCODING must be set before opening the file - # to prevent automatic decoding to UTF-8 by GDAL, so we save previous - # SHAPE_ENCODING so that it can be restored later - # (we do this for all data sources where encoding is set because - # we don't know the driver until after it is opened, which is too late) - encoding_b = encoding.encode("UTF-8") - encoding_c = encoding_b - prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) + prev_shape_encoding = 
override_threadlocal_config_option("SHAPE_ENCODING", encoding) dataset_options = dict_to_options(dataset_kwargs) ogr_dataset = ogr_open(path_c, 0, dataset_options) @@ -1747,13 +1732,8 @@ def ogr_read_info( layer = get_default_layer(ogr_dataset) ogr_layer = get_ogr_layer(ogr_dataset, layer) - # Encoding is derived from the user, from the dataset capabilities / type, - # or from the system locale if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": - # Because SHAPE_ENCODING is set above, GDAL will automatically - # decode shapefiles to UTF-8; ignore any encoding set by user encoding = "UTF-8" - else: encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) @@ -2102,13 +2082,11 @@ cdef create_ogr_dataset_layer( # file containing the encoding; this is not a supported option for # other drivers. This is done after setting general options above # to override ENCODING if passed by the user as a layer option. - if encoding is None: - encoding = "UTF-8" - - else: - if "ENCODING" in layer_kwargs: - raise ValueError('cannot provide both encoding parameter and "ENCODING" layer creation option; use the encoding parameter') + if encoding and "ENCODING" in layer_kwargs: + raise ValueError('cannot provide both encoding parameter and "ENCODING" layer creation option; use the encoding parameter') + # always write to UTF-8 if encoding is not set + encoding = encoding or "UTF-8" encoding_b = encoding.upper().encode('UTF-8') encoding_c = encoding_b layer_options = CSLSetNameValue(layer_options, "ENCODING", encoding_c) @@ -2226,7 +2204,6 @@ def ogr_write( &ogr_dataset, &ogr_layer, ) - if driver == 'ESRI Shapefile': # force encoding for remaining operations to be in UTF-8 (even if user # provides an encoding) because GDAL will automatically convert those to @@ -2479,10 +2456,17 @@ def ogr_write_arrow( &ogr_dataset, &ogr_layer, ) + # only shapefile supports non-UTF encoding because ENCODING option is set + # during dataset creation and GDAL auto-translates from UTF-8 values to that + # encoding + if encoding and encoding.replace('-','').upper() != 'UTF8' and driver != 'ESRI Shapefile': + raise ValueError("non-UTF-8 encoding is not supported for Arrow; use the non-Arrow interface instead") + if geometry_name: opts = {"GEOMETRY_NAME": geometry_name} else: opts = {} + options = dict_to_options(opts) try: diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index a3b165c5..05b27ac7 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -14,6 +14,7 @@ read_dataframe, read_info, list_layers, + get_gdal_config_option, set_gdal_config_options, ) from pyogrio.raw import open_arrow, read_arrow, write, write_arrow @@ -797,3 +798,100 @@ def test_write_schema_error_message(tmpdir): geometry_type="Point", geometry_name="geometry", ) + + +@requires_arrow_write_api +def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text): + encoding, text = encoded_text + + table = pa.table( + { + # Point(0, 0) + "geometry": pa.array( + [bytes.fromhex("010100000000000000000000000000000000000000")] + ), + text: pa.array([text]), + } + ) + + filename = tmp_path / "test.shp" + write_arrow( + table, + filename, + geometry_type="Point", + geometry_name="geometry", + crs="EPSG:4326", + encoding=encoding, + ) + + # NOTE: GDAL automatically creates a cpg file with the encoding name, which + # means that if we read this without specifying the encoding it uses the + # correct one + schema, table = read_arrow(filename) + assert schema["fields"][0] == text + assert table[text][0].as_py() == 
text + + # verify that if cpg file is not present, that user-provided encoding must be used + os.unlink(str(filename).replace(".shp", ".cpg")) + + # We will assume ISO-8859-1, which is wrong + miscoded = text.encode(encoding).decode("ISO-8859-1") + bad_schema = read_arrow(filename)[0] + assert bad_schema["fields"][0] == miscoded + # table cannot be decoded to UTF-8 without UnicodeDecodeErrors + + # If encoding is provided, that should yield correct text + schema, table = read_arrow(filename, encoding=encoding) + assert schema["fields"][0] == text + assert table[text][0].as_py() == text + + # verify that setting encoding does not corrupt SHAPE_ENCODING option if set + # globally (it is ignored during read when encoding is specified by user) + try: + set_gdal_config_options({"SHAPE_ENCODING": "CP1254"}) + _ = read_arrow(filename, encoding=encoding) + assert get_gdal_config_option("SHAPE_ENCODING") == "CP1254" + + finally: + # reset to clear between tests + set_gdal_config_options({"SHAPE_ENCODING": None}) + + +@requires_arrow_write_api +def test_encoding_write_layer_option_collision_shapefile(tmpdir, naturalearth_lowres): + """Providing both encoding parameter and ENCODING layer creation option (even if blank) is not allowed""" + + meta, table = read_arrow(naturalearth_lowres) + filename = os.path.join(str(tmpdir), "test.shp") + + with pytest.raises( + ValueError, + match='cannot provide both encoding parameter and "ENCODING" layer creation option', + ): + write_arrow( + table, + filename, + crs=meta["crs"], + geometry_type="MultiPolygon", + geometry_name=meta["geometry_name"] or "wkb_geometry", + encoding="CP936", + layer_options={"ENCODING": ""}, + ) + + +@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +def test_non_utf8_encoding_io_arrow_exception(tmpdir, naturalearth_lowres, ext): + meta, table = read_arrow(naturalearth_lowres) + filename = os.path.join(str(tmpdir), f"test.{ext}") + + with pytest.raises( + ValueError, match="non-UTF-8 encoding is not supported for Arrow" + ): + write_arrow( + table, + filename, + crs=meta["crs"], + geometry_type="MultiPolygon", + geometry_name=meta["geometry_name"] or "wkb_geometry", + encoding="CP936", + ) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index fcf8e5db..72fa5c3c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1735,15 +1735,6 @@ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): @requires_pyarrow_api @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text): - """Verify that we write non-UTF data to the data source - - IMPORTANT: this may not be valid for the data source and will likely render - them unusable in other tools, but should successfully roundtrip unless we - disable writing using other encodings. 
- - NOTE: pyarrow cannot handle non-UTF-8 characters in this way - """ - encoding, text = encoded_text output_path = tmp_path / f"test.{ext}" diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index 6eb1acaa..d1526ee3 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1237,8 +1237,8 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text): os.unlink(str(filename).replace(".shp", ".cpg")) # We will assume ISO-8859-1, which is wrong - bad_meta, _, _, bad_field_data = read(filename) miscoded = text.encode(encoding).decode("ISO-8859-1") + bad_meta, _, _, bad_field_data = read(filename) assert bad_meta["fields"][0] == miscoded assert bad_field_data[0] == miscoded assert read_info(filename)["fields"][0] == miscoded From 815620a64168fc0cfe17392b98ceb3a91ce3d232 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 17:54:24 -0700 Subject: [PATCH 17/21] Fix missing test annotation --- pyogrio/tests/test_arrow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 05b27ac7..225fbb57 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -879,6 +879,7 @@ def test_encoding_write_layer_option_collision_shapefile(tmpdir, naturalearth_lo ) +@requires_arrow_write_api @pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) def test_non_utf8_encoding_io_arrow_exception(tmpdir, naturalearth_lowres, ext): meta, table = read_arrow(naturalearth_lowres) From 6eca2651945ffb0f25ac5cd33cc04d36959c88ea Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 18:03:47 -0700 Subject: [PATCH 18/21] Don't fail in error handler if message cannot be decoded to UTF-8 --- pyogrio/_err.pyx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pyogrio/_err.pyx b/pyogrio/_err.pyx index 089042c5..aa8461af 100644 --- a/pyogrio/_err.pyx +++ b/pyogrio/_err.pyx @@ -153,9 +153,13 @@ cdef inline object exc_check(): else: # Reformat messages. msg_b = err_msg - msg = msg_b.decode('utf-8') - msg = msg.replace("`", "'") - msg = msg.replace("\n", " ") + + try: + msg = msg_b.decode('utf-8') + msg = msg.replace("`", "'") + msg = msg.replace("\n", " ") + except UnicodeDecodeError as exc: + msg = f"Could not decode error message to UTF-8. Raw error bytes: {msg}" if err_type == 3: CPLErrorReset() From 34bacc31a001c7a7b73755566b81076eb8fbb1f2 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 18:06:28 -0700 Subject: [PATCH 19/21] Fix bug in attempted fix --- pyogrio/_err.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/_err.pyx b/pyogrio/_err.pyx index aa8461af..04097c7a 100644 --- a/pyogrio/_err.pyx +++ b/pyogrio/_err.pyx @@ -159,7 +159,7 @@ cdef inline object exc_check(): msg = msg.replace("`", "'") msg = msg.replace("\n", " ") except UnicodeDecodeError as exc: - msg = f"Could not decode error message to UTF-8. Raw error bytes: {msg}" + msg = f"Could not decode error message to UTF-8. 
Raw error bytes: {msg_b}" if err_type == 3: CPLErrorReset() From e088311dad7313a7d8afb17183ba32d04b714537 Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 18:19:59 -0700 Subject: [PATCH 20/21] Fix failing test for GDAL >= 3.9 --- pyogrio/_err.pyx | 2 +- pyogrio/tests/test_geopandas_io.py | 4 +++- pyogrio/tests/test_raw_io.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pyogrio/_err.pyx b/pyogrio/_err.pyx index 04097c7a..51f2fcfb 100644 --- a/pyogrio/_err.pyx +++ b/pyogrio/_err.pyx @@ -159,7 +159,7 @@ cdef inline object exc_check(): msg = msg.replace("`", "'") msg = msg.replace("\n", " ") except UnicodeDecodeError as exc: - msg = f"Could not decode error message to UTF-8. Raw error bytes: {msg_b}" + msg = f"Could not decode error message to UTF-8. Raw error: {msg_b}" if err_type == 3: CPLErrorReset() diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 72fa5c3c..a6360d4c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1705,7 +1705,7 @@ def test_arrow_bool_exception(tmpdir, ext): _ = read_dataframe(filename, use_arrow=True) -@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +@pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source @@ -1713,6 +1713,8 @@ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): them unusable in other tools, but should successfully roundtrip unless we disable writing using other encodings. + NOTE: FlatGeobuff driver cannot handle non-UTF data in GDAL >= 3.9 + NOTE: pyarrow cannot handle non-UTF-8 characters in this way """ diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index d1526ee3..915c9438 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -1172,13 +1172,15 @@ def test_encoding_io_shapefile(tmp_path, read_encoding, write_encoding): ) -@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +@pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): """Verify that we write non-UTF data to the data source IMPORTANT: this may not be valid for the data source and will likely render them unusable in other tools, but should successfully roundtrip unless we disable writing using other encodings. 
+ + NOTE: FlatGeobuff driver cannot handle non-UTF data in GDAL >= 3.9 """ encoding, text = encoded_text From 58d1d3f5c681fc0131259d75735fc7e80b0dd1bc Mon Sep 17 00:00:00 2001 From: Brendan Ward Date: Thu, 25 Apr 2024 18:24:10 -0700 Subject: [PATCH 21/21] Fix other failing FlatGeobuff tests --- pyogrio/tests/test_arrow.py | 2 +- pyogrio/tests/test_geopandas_io.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 225fbb57..9c134d05 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -880,7 +880,7 @@ def test_encoding_write_layer_option_collision_shapefile(tmpdir, naturalearth_lo @requires_arrow_write_api -@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +@pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_non_utf8_encoding_io_arrow_exception(tmpdir, naturalearth_lowres, ext): meta, table = read_arrow(naturalearth_lowres) filename = os.path.join(str(tmpdir), f"test.{ext}") diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index a6360d4c..dbda1173 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1735,7 +1735,7 @@ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text): @requires_pyarrow_api -@pytest.mark.parametrize("ext", ["fgb", "gpkg", "geojson"]) +@pytest.mark.parametrize("ext", ["gpkg", "geojson"]) def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text): encoding, text = encoded_text output_path = tmp_path / f"test.{ext}"
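
Usage note (illustrative, not part of the patches above): the core technique in this
series is a save/override/restore of GDAL's SHAPE_ENCODING configuration option around
opening a shapefile. A rough Python-level analogue of that sequence is sketched below
using pyogrio's public config helpers; the helper name read_with_shape_encoding is
hypothetical, and pyogrio itself performs this step in Cython via the thread-local
CPLGetThreadLocalConfigOption / CPLSetThreadLocalConfigOption / CPLStrdup / CPLFree
calls shown in the diffs, so user code does not need to do this by hand.

    from pyogrio import get_gdal_config_option, set_gdal_config_options
    from pyogrio.raw import read

    def read_with_shape_encoding(path, encoding):
        # save whatever SHAPE_ENCODING is currently set (None if unset)
        prev = get_gdal_config_option("SHAPE_ENCODING")

        # override before opening the file: GDAL recodes DBF text from
        # `encoding` to UTF-8 while the shapefile is being read
        set_gdal_config_options({"SHAPE_ENCODING": encoding})
        try:
            return read(path)
        finally:
            # restore the prior state; a value of None clears the option
            set_gdal_config_options({"SHAPE_ENCODING": prev})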
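
For the end-to-end behavior the series is after, a minimal round-trip sketch at the
geopandas layer (the file name and the CP936 field name and value are illustrative
only):

    import geopandas as gp
    from shapely.geometry import Point
    from pyogrio import read_dataframe, write_dataframe

    # write a shapefile whose DBF text is encoded as CP936; GDAL also writes
    # a .cpg sidecar recording the encoding
    df = gp.GeoDataFrame({"城市": ["北京"], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
    write_dataframe(df, "test.shp", encoding="CP936")

    # with the .cpg present, GDAL detects CP936 on its own; if the sidecar is
    # lost, passing encoding= still round-trips the field name and value
    actual = read_dataframe("test.shp", encoding="CP936")
    assert actual.columns[0] == "城市"
    assert actual["城市"][0] == "北京"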