From 2785ff054a346636c474a598602ef894a2e029ad Mon Sep 17 00:00:00 2001
From: dylni <46035563+dylni@users.noreply.github.com>
Date: Sun, 20 Nov 2022 21:14:10 -0500
Subject: [PATCH] Fix incorrect UTF-8 detection

---
 src/windows/wtf8/code_points.rs |  2 +-
 tests/integration.rs            | 56 +++++++++++----------------------
 2 files changed, 20 insertions(+), 38 deletions(-)

diff --git a/src/windows/wtf8/code_points.rs b/src/windows/wtf8/code_points.rs
index 15e8753..9800d78 100644
--- a/src/windows/wtf8/code_points.rs
+++ b/src/windows/wtf8/code_points.rs
@@ -105,9 +105,9 @@ where
 
                 // This condition is optimized to detect surrogate code points.
                 } else if code_point & 0xFE0 == 0x360 {
+                    self.still_utf8 = false;
                     if code_point & 0x10 == 0 {
                         self.surrogate = true;
-                        self.still_utf8 = false;
                     } else if prev_surrogate {
                         // Decoding a broken surrogate pair would be lossy.
                         invalid = true;
diff --git a/tests/integration.rs b/tests/integration.rs
index 3cdca8b..3cbe793 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -6,15 +6,11 @@ mod common;
 use common::Result;
 use common::WTF8_STRING;
 
-const INVALID_STRING: &[u8] = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
-
-const UTF8_STRING: &str = "string";
-
-fn test_string_is_invalid_utf8(string: &[u8]) {
+fn assert_string_is_invalid_utf8(string: &[u8]) {
     assert!(str::from_utf8(string).is_err());
 }
 
-fn test_invalid_result(result: &Result<()>) {
+fn assert_invalid_result(result: &Result<()>) {
     if cfg!(windows) {
         assert!(result.is_err());
     } else {
@@ -23,51 +19,37 @@ fn test_invalid_result(result: &Result<()>) {
 }
 
 #[test]
-fn test_empty_bytes() {
+fn test_empty() {
     common::test_utf8_bytes("");
-}
-
-#[test]
-fn test_empty_vec() {
     common::test_utf8_vec("");
 }
 
 #[test]
-fn test_nonempty_utf8_bytes() {
-    common::test_utf8_bytes(UTF8_STRING);
-}
+fn test_nonempty_utf8() {
+    const UTF8_STRING: &str = "string";
 
-#[test]
-fn test_nonempty_utf8_vec() {
+    common::test_utf8_bytes(UTF8_STRING);
     common::test_utf8_vec(UTF8_STRING);
 }
 
 #[test]
-fn test_invalid_string_is_invalid_utf8() {
-    test_string_is_invalid_utf8(INVALID_STRING);
-}
+fn test_invalid() {
+    const INVALID_STRING: &[u8] = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
+    assert_string_is_invalid_utf8(INVALID_STRING);
 
-#[test]
-fn test_invalid_bytes() {
-    test_invalid_result(&common::test_bytes(INVALID_STRING));
+    assert_invalid_result(&common::test_bytes(INVALID_STRING));
+    assert_invalid_result(&common::test_vec(INVALID_STRING));
 }
 
 #[test]
-fn test_invalid_vec() {
-    test_invalid_result(&common::test_vec(INVALID_STRING));
-}
+fn test_wtf8() {
+    const HIGH_SURROGATE: &[u8] = b"\xED\xA0\x80";
+    const LOW_SURROGATE: &[u8] = b"\xED\xB0\x80";
 
-#[test]
-fn test_wtf8_string_is_invalid_utf8() {
-    test_string_is_invalid_utf8(WTF8_STRING);
-}
+    for string in [WTF8_STRING, HIGH_SURROGATE, LOW_SURROGATE] {
+        assert_string_is_invalid_utf8(string);
 
-#[test]
-fn test_wtf8_bytes() {
-    assert_eq!(Ok(()), common::test_bytes(WTF8_STRING));
-}
-
-#[test]
-fn test_wtf8_vec() {
-    assert_eq!(Ok(()), common::test_vec(WTF8_STRING));
+        assert_eq!(Ok(()), common::test_bytes(string));
+        assert_eq!(Ok(()), common::test_vec(string));
+    }
 }