From 858597e18ac5c47c3b81192810e2c203af0cc8ad Mon Sep 17 00:00:00 2001 From: shrektan Date: Sun, 11 Oct 2020 22:16:44 +0800 Subject: [PATCH 1/9] convert the fread error message to native encoding in order to display message correctly in R --- src/fread.c | 2 +- src/fread.h | 4 ++++ src/freadR.c | 2 +- src/freadR.h | 1 + 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/fread.c b/src/fread.c index 5b9bac3f0..7b1ba6df0 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2575,7 +2575,7 @@ int freadMain(freadMainArgs _args) { if (ch==eof) { // whitespace at the end of the file is always skipped ok } else { - const char *skippedFooter = ch; + const char *skippedFooter = ENC2NATIVE(ch); // detect if it's a single line footer. Commonly the row count from SQL queries. while (ch +#include #include "po.h" #define FREAD_MAIN_ARGS_EXTRA_FIELDS \ From ebdb055eb181ff3850bfbb5a8520ef93a5c32c72 Mon Sep 17 00:00:00 2001 From: shrektan Date: Sun, 11 Oct 2020 22:19:51 +0800 Subject: [PATCH 2/9] tweak the comment --- src/fread.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fread.h b/src/fread.h index b2dbd01af..0f365511c 100644 --- a/src/fread.h +++ b/src/fread.h @@ -11,7 +11,8 @@ #else #include "freadR.h" extern cetype_t ienc; - // must convert the error message char to native encoding first in order to correctly display in R + // R's message functions only take C's char pointer not SEXP, where encoding info can't be stored + // so must convert the error message char to native encoding first in order to correctly display in R #define ENC2NATIVE(s) translateChar(mkCharCE(s, ienc)) #endif From cbe834c139b32a9c10a06c359e951bf6e50e3fcd Mon Sep 17 00:00:00 2001 From: shrektan Date: Sun, 11 Oct 2020 22:23:19 +0800 Subject: [PATCH 3/9] add tests --- inst/tests/tests.Rraw | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e4dbcb480..51da36f40 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17150,3 +17150,14 @@ test(2154.1, fread("0.0\n", colClasses="integer"), data.table(V1=0.0), test(2154.2, fread("A\n0.0\n", colClasses="integer"), data.table(A=0.0), warning="Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.*please") +# fread message display non-ASCII messages correctly, #4747 +x = "fa\xE7ile" +Encoding(x) = "latin1" +txt = sprintf("A,B\n%s,%s\n%s", x, x, x) +Encoding(txt) = "UTF-8" +txt2 = iconv(txt, "UTF-8", "latin1") +out = data.table(A = x, B = x) +test(2155.1, fread(text = txt, encoding = 'UTF-8'), out, + warning="Discarded single-line footer: <>") +test(2155.2, fread(text = txt2, encoding = 'Latin-1'), out, + warning="Discarded single-line footer: <>") From 41141c5d1cba01da67f0dd087650a9f2f087a222 Mon Sep 17 00:00:00 2001 From: shrektan Date: Sun, 11 Oct 2020 22:27:48 +0800 Subject: [PATCH 4/9] only run this test if native encoding can represent Latin1 text --- inst/tests/tests.Rraw | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 51da36f40..c98652cee 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17151,13 +17151,15 @@ test(2154.2, fread("A\n0.0\n", colClasses="integer"), data.table(A=0.0), warning="Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.*please") # fread message display non-ASCII messages correctly, #4747 -x = "fa\xE7ile" -Encoding(x) = "latin1" -txt = sprintf("A,B\n%s,%s\n%s", x, x, x) -Encoding(txt) = "UTF-8" -txt2 = iconv(txt, "UTF-8", "latin1") -out = data.table(A = x, B = x) -test(2155.1, fread(text = txt, encoding = 'UTF-8'), out, - warning="Discarded single-line footer: <>") -test(2155.2, fread(text = txt2, encoding = 'Latin-1'), out, - warning="Discarded single-line footer: <>") +x = "fa\xE7ile"; Encoding(x) = "latin1" +# should only run this test if the native encoding can represent latin1 correctly +if (identical(x, enc2native(x))) { + txt = sprintf("A,B\n%s,%s\n%s", x, x, x) + Encoding(txt) = "UTF-8" + txt2 = iconv(txt, "UTF-8", "latin1") + out = data.table(A = x, B = x) + test(2155.1, fread(text = txt, encoding = 'UTF-8'), out, + warning="Discarded single-line footer: <>") + test(2155.2, fread(text = txt2, encoding = 'Latin-1'), out, + warning="Discarded single-line footer: <>") +} From 77f48d0288ee70459a70aa1a2374dc928f0e035f Mon Sep 17 00:00:00 2001 From: shrektan Date: Tue, 13 Oct 2020 02:15:15 +0800 Subject: [PATCH 5/9] should have used the UTF-8 escaped raw text --- inst/tests/tests.Rraw | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c98652cee..21a1d1474 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17151,11 +17151,10 @@ test(2154.2, fread("A\n0.0\n", colClasses="integer"), data.table(A=0.0), warning="Attempt to override column 1 <> of inherent type 'float64' down to 'int32' ignored.*please") # fread message display non-ASCII messages correctly, #4747 -x = "fa\xE7ile"; Encoding(x) = "latin1" +x = "fa\u00e7ile"; Encoding(x) = "UTF-8" # should only run this test if the native encoding can represent latin1 correctly if (identical(x, enc2native(x))) { - txt = sprintf("A,B\n%s,%s\n%s", x, x, x) - Encoding(txt) = "UTF-8" + txt = enc2utf8(sprintf("A,B\n%s,%s\n%s", x, x, x)) txt2 = iconv(txt, "UTF-8", "latin1") out = data.table(A = x, B = x) test(2155.1, fread(text = txt, encoding = 'UTF-8'), out, From 389002f1ff75846af2ceb42f0a172bbf504e1954 Mon Sep 17 00:00:00 2001 From: shrektan Date: Tue, 13 Oct 2020 21:35:58 +0800 Subject: [PATCH 6/9] Add the news item --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 1f2dbf88a..9124fa1b2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,8 @@ 5. `dplyr::mutate(setDT(as.list(1:64)), V1=11)` threw error `can't set ALTREP truelength`, [#4734](https://github.com/Rdatatable/data.table/issues/4734). Thanks to @etryn for the reproducible example, and to Cole Miller for refinements. +6. `fread()` now throws correct non-ASCII error messages on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. + ## NOTES 1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accomodate `bit64`'s update. From 59ef4bbde581c7b52be8752e1146447a3a658173 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 3 Jan 2021 19:46:07 -0700 Subject: [PATCH 7/9] news item tweak --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 673bcd90b..3ebbe7561 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ ## BUG FIXES +1. `fread()` messages containing non-ASCII exhibits from the file now display correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. + ## NOTES @@ -57,8 +59,6 @@ 5. `dplyr::mutate(setDT(as.list(1:64)), V1=11)` threw error `can't set ALTREP truelength`, [#4734](https://github.com/Rdatatable/data.table/issues/4734). Thanks to @etryn for the reproducible example, and to Cole Miller for refinements. -6. `fread()` now throws correct non-ASCII error messages on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. - ## NOTES 1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accommodate `bit64`'s update. From 74a7f6df3a796c4f817c563b983107dad9a90156 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 4 Jan 2021 12:39:38 -0700 Subject: [PATCH 8/9] news item tweak This PR addresses the footer warning, not all messages. So making that clear in the news item. Maybe future PRs will address other messages which also include exhibits from the file. --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 3ebbe7561..44fb1d49a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,7 @@ ## BUG FIXES -1. `fread()` messages containing non-ASCII exhibits from the file now display correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. +1. If `fread()` discards a single line footer that contains non-ASCII characters, the warning message now displays correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. ## NOTES From 7f3ae4f88e5308f87d1358b213c8bb951b29ebf5 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 4 Jan 2021 12:53:54 -0700 Subject: [PATCH 9/9] news item tweak to avoid any interpretation that the non-ASCII characters might cause the warning message --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 44fb1d49a..e0ea1222e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,7 @@ ## BUG FIXES -1. If `fread()` discards a single line footer that contains non-ASCII characters, the warning message now displays correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. +1. If `fread()` discards a single line footer, the warning message which includes the discarded text now displays any non-ASCII characters correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. ## NOTES