From d3dede69627d5b9e4e378dbe3953c14b0bed2227 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 12 Mar 2019 14:49:17 +0300 Subject: [PATCH 1/5] Fix gh-15086 properly instead of making a workaround --- pandas/_libs/parsers.pyx | 6 +----- pandas/_libs/src/parser/io.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 36c4c752206a8..18959b2d37b7f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -678,11 +678,7 @@ cdef class TextReader: if isinstance(source, basestring): if not isinstance(source, bytes): - if compat.PY36 and compat.is_platform_windows(): - # see gh-15086. - encoding = "mbcs" - else: - encoding = sys.getfilesystemencoding() or "utf-8" + encoding = sys.getfilesystemencoding() or "utf-8" source = source.encode(encoding) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index f578ce138e274..acb64ef0af7bd 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -17,6 +17,11 @@ The full license is in the LICENSE file, distributed with this software. #define O_BINARY 0 #endif // O_BINARY +#if PY_VERSION_HEX >= 0x03060000 && defined(_WIN32) +#define USE_WIN_UTF16 +#include <Windows.h> +#endif + /* On-disk FILE, uncompressed */ @@ -27,7 +32,34 @@ void *new_file_source(char *fname, size_t buffer_size) { return NULL; } +#ifdef USE_WIN_UTF16 + // Fix gh-15086 properly - convert UTF8 to UTF16 that Windows widechar API + // accepts. This is needed because UTF8 might _not_ be convertible to MBCS + // for some conditions, as MBCS is locale-dependent, and not all unicode + // symbols can be expressed in it. 
+ { + wchar_t* wname = NULL; + int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); + if (required == 0) { + free(fs); + return NULL; + } + wname = (wchar_t*)malloc(required * sizeof(wchar_t)); + if (wname == NULL) { + free(fs); + return NULL; + } + if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) < required) { + free(wname); + free(fs); + return NULL; + } + fs->fd = _wopen(wname, O_RDONLY | O_BINARY); + free(wname); + } +#else fs->fd = open(fname, O_RDONLY | O_BINARY); +#endif if (fs->fd == -1) { free(fs); return NULL; From 810416e04bd99e1ed0d26b2f034f0c390b4e2bab Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 15 Mar 2019 21:57:32 +0300 Subject: [PATCH 2/5] fix code style --- pandas/_libs/src/parser/io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index acb64ef0af7bd..5d73230f32955 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -49,7 +49,8 @@ void *new_file_source(char *fname, size_t buffer_size) { free(fs); return NULL; } - if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) < required) { + if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) < + required) { free(wname); free(fs); return NULL; From 8fd2214524bbff1d8e0de3cf9b501a5708b9c7ca Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 19 Mar 2019 17:56:24 +0300 Subject: [PATCH 3/5] Make sure test_filename_with_special_chars properly tests combinations of chars Updated whatsnew --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/tests/io/parser/test_common.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 73eb6a15a1b47..25f2033181c8b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -175,6 +175,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in `TextReader()` not properly working 
with UTF8 on Windows on Python 3.6+ (fix for :issue:`15086` instead of a workaround) Categorical ^^^^^^^^^^^ diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 05da171d7dc31..9e776c2c15c0b 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -5,6 +5,7 @@ specific classification into the other test modules. """ +import sys import codecs from collections import OrderedDict import csv @@ -1904,12 +1905,21 @@ def test_suppress_error_output(all_parsers, capsys): assert captured.err == "" +def __should_skip_utf8_test(): + if compat.is_platform_windows(): + import ctypes + ansi_codepage = ctypes.cdll.kernel32.GetACP() + return ansi_codepage != 1252 and sys.version_info < (3, 6) + return False + +@pytest.mark.skipif(__should_skip_utf8_test(), + reason="Python < 3.6 won't pass on non-1252 codepage") def test_filename_with_special_chars(all_parsers): # see gh-15086. parser = all_parsers df = DataFrame({"a": [1, 2, 3]}) - with tm.ensure_clean("sé-es-vé.csv") as path: + with tm.ensure_clean("sé-es-vé-sй.csv") as path: df.to_csv(path, index=False) result = parser.read_csv(path) From 7961889162368a79dfd94d33f15aac58e2ad7b03 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Tue, 19 Mar 2019 18:16:33 +0300 Subject: [PATCH 4/5] Address comments by @jreback --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/tests/io/parser/test_common.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 25f2033181c8b..807cad8c8eec8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -175,7 +175,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Bug in `TextReader()` not properly working with UTF8 on Windows on Python 3.6+ (fix for :issue:`15086` instead of a workaround) Categorical ^^^^^^^^^^^ @@ -272,6 +271,7 @@ I/O - Bug in :func:`json_normalize` for ``errors='ignore'`` 
where missing values in the input data, were filled in resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) +- Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) - diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 9e776c2c15c0b..4ca8dd34cd175 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -5,8 +5,8 @@ specific classification into the other test modules. """ -import sys import codecs +import ctypes from collections import OrderedDict import csv from datetime import datetime @@ -1905,15 +1905,14 @@ def test_suppress_error_output(all_parsers, capsys): assert captured.err == "" -def __should_skip_utf8_test(): +def __windows_ansi_encoding_not_cp1252(): if compat.is_platform_windows(): - import ctypes ansi_codepage = ctypes.cdll.kernel32.GetACP() - return ansi_codepage != 1252 and sys.version_info < (3, 6) + return ansi_codepage != 1252 and not compat.PY36 return False -@pytest.mark.skipif(__should_skip_utf8_test(), - reason="Python < 3.6 won't pass on non-1252 codepage") +@pytest.mark.skipif(__windows_ansi_encoding_not_cp1252(), + reason="On Python < 3.6 won't pass on non-1252 codepage") def test_filename_with_special_chars(all_parsers): # see gh-15086. 
parser = all_parsers From 6095ea3c81786f856b48133db68fa09beb1d1561 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Wed, 20 Mar 2019 13:00:47 +0300 Subject: [PATCH 5/5] Parametrize test_filename_with_special_chars Use CP-1252 and CP-1251 filenames separately, skip the test on Windows on < 3.6 as it won't pass --- pandas/tests/io/parser/test_common.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 4ca8dd34cd175..9060543f1a373 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -6,7 +6,6 @@ """ import codecs -import ctypes from collections import OrderedDict import csv from datetime import datetime @@ -1905,20 +1904,15 @@ def test_suppress_error_output(all_parsers, capsys): assert captured.err == "" -def __windows_ansi_encoding_not_cp1252(): - if compat.is_platform_windows(): - ansi_codepage = ctypes.cdll.kernel32.GetACP() - return ansi_codepage != 1252 and not compat.PY36 - return False - -@pytest.mark.skipif(__windows_ansi_encoding_not_cp1252(), - reason="On Python < 3.6 won't pass on non-1252 codepage") -def test_filename_with_special_chars(all_parsers): +@pytest.mark.skipif(compat.is_platform_windows() and not compat.PY36, + reason="On Python < 3.6 won't pass on Windows") +@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv"]) +def test_filename_with_special_chars(all_parsers, filename): # see gh-15086. parser = all_parsers df = DataFrame({"a": [1, 2, 3]}) - with tm.ensure_clean("sé-es-vé-sй.csv") as path: + with tm.ensure_clean(filename) as path: df.to_csv(path, index=False) result = parser.read_csv(path)