From 2785ff054a346636c474a598602ef894a2e029ad Mon Sep 17 00:00:00 2001
From: dylni <46035563+dylni@users.noreply.github.com>
Date: Sun, 20 Nov 2022 21:14:10 -0500
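Subject: [PATCH] Fix incorrect UTF-8 detection

The code point decoder cleared `still_utf8` only in the first surrogate
branch (`code_point & 0x10 == 0`), so a surrogate that took the other
branch left the flag set. Clear the flag for every surrogate code point
instead.

Consolidate the integration tests and add cases for a lone high
surrogate (b"\xED\xA0\x80") and a lone low surrogate (b"\xED\xB0\x80").
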
---
 src/windows/wtf8/code_points.rs |  2 +-
 tests/integration.rs            | 56 +++++++++++----------------------
 2 files changed, 20 insertions(+), 38 deletions(-)

diff --git a/src/windows/wtf8/code_points.rs b/src/windows/wtf8/code_points.rs
index 15e8753..9800d78 100644
--- a/src/windows/wtf8/code_points.rs
+++ b/src/windows/wtf8/code_points.rs
@@ -105,9 +105,9 @@ where
 
                 // This condition is optimized to detect surrogate code points.
                 } else if code_point & 0xFE0 == 0x360 {
+                    self.still_utf8 = false;
                     if code_point & 0x10 == 0 {
                         self.surrogate = true;
-                        self.still_utf8 = false;
                     } else if prev_surrogate {
                         // Decoding a broken surrogate pair would be lossy.
                         invalid = true;
diff --git a/tests/integration.rs b/tests/integration.rs
index 3cdca8b..3cbe793 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -6,15 +6,11 @@ mod common;
 use common::Result;
 use common::WTF8_STRING;
 
-const INVALID_STRING: &[u8] = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
-
-const UTF8_STRING: &str = "string";
-
-fn test_string_is_invalid_utf8(string: &[u8]) {
+fn assert_string_is_invalid_utf8(string: &[u8]) {
     assert!(str::from_utf8(string).is_err());
 }
 
-fn test_invalid_result(result: &Result<()>) {
+fn assert_invalid_result(result: &Result<()>) {
     if cfg!(windows) {
         assert!(result.is_err());
     } else {
@@ -23,51 +19,37 @@ fn test_invalid_result(result: &Result<()>) {
 }
 
 #[test]
-fn test_empty_bytes() {
+fn test_empty() {
     common::test_utf8_bytes("");
-}
-
-#[test]
-fn test_empty_vec() {
     common::test_utf8_vec("");
 }
 
 #[test]
-fn test_nonempty_utf8_bytes() {
-    common::test_utf8_bytes(UTF8_STRING);
-}
+fn test_nonempty_utf8() {
+    const UTF8_STRING: &str = "string";
 
-#[test]
-fn test_nonempty_utf8_vec() {
+    common::test_utf8_bytes(UTF8_STRING);
     common::test_utf8_vec(UTF8_STRING);
 }
 
 #[test]
-fn test_invalid_string_is_invalid_utf8() {
-    test_string_is_invalid_utf8(INVALID_STRING);
-}
+fn test_invalid() {
+    const INVALID_STRING: &[u8] = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
+    assert_string_is_invalid_utf8(INVALID_STRING);
 
-#[test]
-fn test_invalid_bytes() {
-    test_invalid_result(&common::test_bytes(INVALID_STRING));
+    assert_invalid_result(&common::test_bytes(INVALID_STRING));
+    assert_invalid_result(&common::test_vec(INVALID_STRING));
 }
 
 #[test]
-fn test_invalid_vec() {
-    test_invalid_result(&common::test_vec(INVALID_STRING));
-}
+fn test_wtf8() {
+    const HIGH_SURROGATE: &[u8] = b"\xED\xA0\x80";
+    const LOW_SURROGATE: &[u8] = b"\xED\xB0\x80";
 
-#[test]
-fn test_wtf8_string_is_invalid_utf8() {
-    test_string_is_invalid_utf8(WTF8_STRING);
-}
+    for string in [WTF8_STRING, HIGH_SURROGATE, LOW_SURROGATE] {
+        assert_string_is_invalid_utf8(string);
 
-#[test]
-fn test_wtf8_bytes() {
-    assert_eq!(Ok(()), common::test_bytes(WTF8_STRING));
-}
-
-#[test]
-fn test_wtf8_vec() {
-    assert_eq!(Ok(()), common::test_vec(WTF8_STRING));
+        assert_eq!(Ok(()), common::test_bytes(string));
+        assert_eq!(Ok(()), common::test_vec(string));
+    }
 }