From 471104f43b8a1c678dbcffc1d137f009fa881bc7 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Thu, 19 Sep 2024 21:33:24 +0300 Subject: [PATCH] Restore PUA range check, add manual tests --- src/gb18030.rs | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/gb18030.rs b/src/gb18030.rs index 515d0e8..4933fac 100644 --- a/src/gb18030.rs +++ b/src/gb18030.rs @@ -388,8 +388,11 @@ fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { let offset = if other_trail < 0x3F { 0x40 } else { 0x41 }; return Some((other_lead + (0x81 + 0x20), other_trail + offset)); } - // CJK Radicals Supplement and U+9FBx ideographs in GBK_BOTTOM - if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0x9FB4, 0x9FBB) { + // CJK Radicals Supplement, PUA, and U+9FBx ideographs in GBK_BOTTOM + if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) + || in_inclusive_range16(bmp, 0x9FB4, 0x9FBB) + || in_inclusive_range16(bmp, 0xE816, 0xE855) + { if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) { let trail = pos + 16; let offset = if trail < 0x3F { 0x40 } else { 0x41 }; @@ -615,11 +618,18 @@ mod tests { decode_gb18030(b"\x81\x80", "\u{4E90}"); decode_gb18030(b"\x81\xFE", "\u{4FA2}"); decode_gb18030(b"\xFE\x40", "\u{FA0C}"); - decode_gb18030(b"\xFE\x7E", "\u{E843}"); decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}"); decode_gb18030(b"\xFE\x80", "\u{4723}"); decode_gb18030(b"\xFE\xFE", "\u{E4C5}"); + // Changes between GB18030-2005 and GB18030-2022 + decode_gb18030(b"\xFE\x7E", "\u{9FB9}"); + decode_gb18030(b"\xA6\xDD", "\u{FE14}"); + + // These mappings remain in place the GB18030-2005 way despite GB18030-2022 + decode_gb18030(b"\x82\x35\x91\x32", "\u{9FB9}"); + decode_gb18030(b"\x84\x31\x83\x30", "\u{FE14}"); + // The difference from the original GB18030 decode_gb18030(b"\xA3\xA0", "\u{3000}"); decode_gb18030(b"\xA1\xA1", "\u{3000}"); @@ -687,6 +697,15 @@ mod tests { // Edge cases encode_gb18030("\u{00F7}", b"\xA1\xC2"); + + // GB18030-2022 + encode_gb18030("\u{9FB9}", b"\xFE\x7E"); + encode_gb18030("\u{FE14}", b"\xA6\xDD"); + encode_gb18030("\u{E843}", b"\xFE\x7E"); + encode_gb18030("\u{E791}", b"\xA6\xDD"); + + // Non-change in GB18030-2022 + encode_gb18030("\u{E817}", b"\xFE\x52"); } #[test] @@ -729,6 +748,15 @@ mod tests { // Edge cases encode_gbk("\u{00F7}", b"\xA1\xC2"); + + // GB18030-2022 + encode_gb18030("\u{9FB9}", b"\xFE\x7E"); + encode_gb18030("\u{FE14}", b"\xA6\xDD"); + encode_gb18030("\u{E843}", b"\xFE\x7E"); + encode_gb18030("\u{E791}", b"\xA6\xDD"); + + // Non-change in GB18030-2022 + encode_gb18030("\u{E817}", b"\xFE\x52"); } #[test]