diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 73eb6a15a1b47..807cad8c8eec8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -271,6 +271,7 @@ I/O - Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) +- Bug in :func:`read_csv` not properly interpreting UTF-8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) - diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 36c4c752206a8..18959b2d37b7f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -678,11 +678,7 @@ cdef class TextReader: if isinstance(source, basestring): if not isinstance(source, bytes): - if compat.PY36 and compat.is_platform_windows(): - # see gh-15086. - encoding = "mbcs" - else: - encoding = sys.getfilesystemencoding() or "utf-8" + encoding = sys.getfilesystemencoding() or "utf-8" source = source.encode(encoding) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index f578ce138e274..5d73230f32955 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -17,6 +17,11 @@ The full license is in the LICENSE file, distributed with this software. 
#define O_BINARY 0 #endif // O_BINARY +#if PY_VERSION_HEX >= 0x03060000 && defined(_WIN32) +#define USE_WIN_UTF16 +#include <windows.h> +#endif + /* On-disk FILE, uncompressed */ @@ -27,7 +32,35 @@ void *new_file_source(char *fname, size_t buffer_size) { return NULL; } +#ifdef USE_WIN_UTF16 + // Fix gh-15086 properly - convert UTF8 to UTF16 that Windows widechar API + // accepts. This is needed because UTF8 might _not_ be convertible to MBCS + // for some conditions, as MBCS is locale-dependent, and not all unicode + // symbols can be expressed in it. + { + wchar_t* wname = NULL; + int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); + if (required == 0) { + free(fs); + return NULL; + } + wname = (wchar_t*)malloc(required * sizeof(wchar_t)); + if (wname == NULL) { + free(fs); + return NULL; + } + if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) < + required) { + free(wname); + free(fs); + return NULL; + } + fs->fd = _wopen(wname, O_RDONLY | O_BINARY); + free(wname); + } +#else fs->fd = open(fname, O_RDONLY | O_BINARY); +#endif if (fs->fd == -1) { free(fs); return NULL; diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 05da171d7dc31..9060543f1a373 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1904,12 +1904,15 @@ def test_suppress_error_output(all_parsers, capsys): assert captured.err == "" -def test_filename_with_special_chars(all_parsers): +@pytest.mark.skipif(compat.is_platform_windows() and not compat.PY36, + reason="On Python < 3.6 won't pass on Windows") +@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv"]) +def test_filename_with_special_chars(all_parsers, filename): # see gh-15086. parser = all_parsers df = DataFrame({"a": [1, 2, 3]}) - with tm.ensure_clean("sé-es-vé.csv") as path: + with tm.ensure_clean(filename) as path: df.to_csv(path, index=False) result = parser.read_csv(path)