Fix normalization of IRIs

yescallop · Aug 19, 2024 · ca12fc5 · ca12fc5
1 parent fc14575
commit ca12fc5
Show file tree

Hide file tree

Showing 5 changed files with 104 additions and 38 deletions.
diff --git a/README.md b/README.md
@@ -27,8 +27,12 @@ A *[URI reference]* is either a *[URI]* or a *[relative reference]*. If it start
 a relative reference. For example, `//example.org/`, `/index.html`, `../`, `foo`,
 `?bar`, and `#baz` are relative references.
 
+An *[IRI]* (reference) is an internationalized version of a URI (reference);
+unlike a URI, it may contain non-ASCII characters.
+
 [URI reference]: https://datatracker.ietf.org/doc/html/rfc3986#section-4.1
 [URI]: https://datatracker.ietf.org/doc/html/rfc3986#section-3
+[IRI]: https://datatracker.ietf.org/doc/html/rfc3987#section-2.2
 [relative reference]: https://datatracker.ietf.org/doc/html/rfc3986#section-4.2
 [scheme]: https://datatracker.ietf.org/doc/html/rfc3986#section-3.1
 

diff --git a/src/common.rs b/src/common.rs
@@ -499,12 +499,13 @@ macro_rules! ri_maybe_ref {
             #[doc = concat!("Normalizes the ", $name, ".")]
             ///
             /// This method applies the syntax-based normalization described in
-            /// [Section 6.2.2 of RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986#section-6.2.2),
+            /// [Section 6.2.2 of RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986#section-6.2.2)
+            /// and [Section 5.3.2 of RFC 3987](https://datatracker.ietf.org/doc/html/rfc3987#section-5.3.2),
             /// which is effectively equivalent to taking the following steps in order:
             ///
-            /// - Decode any percent-encoded octet that corresponds to an unreserved character.
+            /// - Decode any sequence of percent-encoded octets that corresponds to a non-reserved character allowed in the enclosing component.
             /// - Uppercase the hexadecimal digits within all percent-encoded octets.
-            /// - Lowercase the scheme and the host except the percent-encoded octets.
+            /// - Lowercase all ASCII characters within the scheme and the host except the percent-encoded octets.
             /// - Turn any IPv6 literal address into its canonical form as per
             ///   [RFC 5952](https://datatracker.ietf.org/doc/html/rfc5952).
             /// - If the port is empty, remove its `':'` delimiter.
@@ -514,10 +515,9 @@ macro_rules! ri_maybe_ref {
             /// - If `self` contains no authority and its path would start with
             ///   `"//"`, prepend `"/."` to the path.
             ///
-            /// [`UriRef::resolve_against`]: crate::UriRef::resolve_against
-            ///
             /// This method is idempotent: `self.normalize()` equals `self.normalize().normalize()`.
             ///
+            /// [`UriRef::resolve_against`]: crate::UriRef::resolve_against
             /// [`remove_dot_segments`]: https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
             ///
             /// # Examples
@@ -531,7 +531,7 @@ macro_rules! ri_maybe_ref {
             /// ```
             #[must_use]
             pub fn normalize(&self) -> $Ty<String> {
-                RiRef::new_pair(normalizer::normalize(self.as_ref_loose()))
+                RiRef::new_pair(normalizer::normalize(self.as_ref_loose(), $must_be_ascii))
             }
 
             $(

diff --git a/src/encoding/table.rs b/src/encoding/table.rs
@@ -12,13 +12,13 @@ const INDEX_PCT_ENCODED: usize = 256;
 const INDEX_UCSCHAR: usize = 256 + 1;
 const INDEX_IPRIVATE: usize = 256 + 2;
 
-const fn is_ucschar(x: u32) -> bool {
+pub(crate) const fn is_ucschar(x: u32) -> bool {
     matches!(x, 0xa0..=0xd7ff | 0xf900..=0xfdcf | 0xfdf0..=0xffef)
         || (x >= 0x10000 && x <= 0xdffff && (x & 0xffff) <= 0xfffd)
         || (x >= 0xe1000 && x <= 0xefffd)
 }
 
-const fn is_iprivate(x: u32) -> bool {
+pub(crate) const fn is_iprivate(x: u32) -> bool {
     (x >= 0xe000 && x <= 0xf8ff) || (x >= 0xf0000 && (x & 0xffff) <= 0xfffd)
 }
 

diff --git a/src/lib.rs b/src/lib.rs
@@ -36,8 +36,12 @@
 //! a relative reference. For example, `//example.org/`, `/index.html`, `../`, `foo`,
 //! `?bar`, and `#baz` are relative references.
 //!
+//! An *[IRI]* (reference) is an internationalized version of a URI (reference);
+//! unlike a URI, it may contain non-ASCII characters.
+//!
 //! [URI]: https://datatracker.ietf.org/doc/html/rfc3986#section-3
 //! [URI reference]: https://datatracker.ietf.org/doc/html/rfc3986#section-4.1
+//! [IRI]: https://datatracker.ietf.org/doc/html/rfc3987#section-2.2
 //! [relative reference]: https://datatracker.ietf.org/doc/html/rfc3986#section-4.2
 //! [scheme]: https://datatracker.ietf.org/doc/html/rfc3986#section-3.1
 //!

diff --git a/src/normalizer.rs b/src/normalizer.rs
@@ -1,13 +1,16 @@
 use crate::{
     common::Ref,
-    encoding::{decode_octet, next_code_point, table::UNRESERVED},
+    encoding::{
+        decode_octet, encode_byte, next_code_point,
+        table::{is_iprivate, is_ucschar, UNRESERVED},
+    },
     internal::{HostMeta, Meta},
     parser, resolver,
 };
-use alloc::string::String;
+use alloc::{string::String, vec::Vec};
 use core::{fmt::Write, num::NonZeroUsize};
 
-pub(crate) fn normalize(r: Ref<'_, '_>) -> (String, Meta) {
+pub(crate) fn normalize(r: Ref<'_, '_>, must_be_ascii: bool) -> (String, Meta) {
     // For "a://[::ffff:5:9]/" the capacity is not enough,
     // but it's fine since this rarely happens.
     let mut buf = String::with_capacity(r.as_str().len());
@@ -16,12 +19,12 @@ pub(crate) fn normalize(r: Ref<'_, '_>) -> (String, Meta) {
     let mut path_buf = String::with_capacity(path.len());
 
     if r.has_scheme() && path.starts_with('/') {
-        normalize_estr(&mut buf, path, false);
+        normalize_estr(&mut buf, path, false, must_be_ascii, false);
         resolver::remove_dot_segments(&mut path_buf, &buf);
         buf.clear();
     } else {
         // Don't remove dot segments from relative reference or rootless path.
-        normalize_estr(&mut path_buf, path, false);
+        normalize_estr(&mut path_buf, path, false, must_be_ascii, false);
     }
 
     let mut meta = Meta::default();
@@ -37,7 +40,7 @@ pub(crate) fn normalize(r: Ref<'_, '_>) -> (String, Meta) {
         buf.push_str("//");
 
         if let Some(userinfo) = auth.userinfo() {
-            normalize_estr(&mut buf, userinfo.as_str(), false);
+            normalize_estr(&mut buf, userinfo.as_str(), false, must_be_ascii, false);
             buf.push('@');
         }
 
@@ -63,7 +66,7 @@ pub(crate) fn normalize(r: Ref<'_, '_>) -> (String, Meta) {
             HostMeta::RegName => {
                 let start = buf.len();
                 let host = auth.host();
-                normalize_estr(&mut buf, host, true);
+                normalize_estr(&mut buf, host, true, must_be_ascii, false);
 
                 if buf.len() < start + host.len() {
                     // Only reparse when the length is less than before.
@@ -92,48 +95,103 @@ pub(crate) fn normalize(r: Ref<'_, '_>) -> (String, Meta) {
 
     if let Some(query) = r.query() {
         buf.push('?');
-        normalize_estr(&mut buf, query.as_str(), false);
+        normalize_estr(&mut buf, query.as_str(), false, must_be_ascii, true);
         meta.query_end = NonZeroUsize::new(buf.len());
     }
 
     if let Some(fragment) = r.fragment() {
         buf.push('#');
-        normalize_estr(&mut buf, fragment.as_str(), false);
+        normalize_estr(&mut buf, fragment.as_str(), false, must_be_ascii, false);
     }
 
     (buf, meta)
 }
 
-fn normalize_estr(buf: &mut String, s: &str, to_lowercase: bool) {
+fn normalize_estr(
+    buf: &mut String,
+    s: &str,
+    to_ascii_lowercase: bool,
+    must_be_ascii: bool,
+    is_query: bool,
+) {
     let s = s.as_bytes();
     let mut i = 0;
 
-    while i < s.len() {
-        // FIXME: Change this to also decode encoded ucschar and iprivate chars.
-        if s[i] == b'%' {
-            let (hi, lo) = (s[i + 1], s[i + 2]);
-            let mut octet = decode_octet(hi, lo);
-            if UNRESERVED.allows_ascii(octet) {
-                if to_lowercase {
-                    octet = octet.to_ascii_lowercase();
+    if must_be_ascii {
+        while i < s.len() {
+            let mut x = s[i];
+            if x == b'%' {
+                let (hi, lo) = (s[i + 1], s[i + 2]);
+                let mut octet = decode_octet(hi, lo);
+                if UNRESERVED.allows_ascii(octet) {
+                    if to_ascii_lowercase {
+                        octet = octet.to_ascii_lowercase();
+                    }
+                    buf.push(octet as char);
+                } else {
+                    buf.push('%');
+                    buf.push(hi.to_ascii_uppercase() as char);
+                    buf.push(lo.to_ascii_uppercase() as char);
                 }
-                buf.push(octet as char);
+                i += 3;
             } else {
-                buf.push('%');
-                buf.push(hi.to_ascii_uppercase() as char);
-                buf.push(lo.to_ascii_uppercase() as char);
+                if to_ascii_lowercase {
+                    x = x.to_ascii_lowercase();
+                }
+                buf.push(x as char);
+                i += 1;
             }
-            i += 3;
-        } else {
-            let (x, len) = next_code_point(s, i);
-            let mut x = char::from_u32(x).unwrap();
-            if to_lowercase {
-                x = x.to_ascii_lowercase();
+        }
+    } else {
+        let mut dec_buf = Vec::new();
+
+        while i < s.len() {
+            if s[i] == b'%' {
+                let (hi, lo) = (s[i + 1], s[i + 2]);
+                let mut octet = decode_octet(hi, lo);
+                if UNRESERVED.allows_ascii(octet) {
+                    consume_dec_buf(buf, &mut dec_buf, is_query);
+
+                    if to_ascii_lowercase {
+                        octet = octet.to_ascii_lowercase();
+                    }
+                    buf.push(octet as char);
+                } else {
+                    dec_buf.push(octet);
+                }
+                i += 3;
+            } else {
+                consume_dec_buf(buf, &mut dec_buf, is_query);
+
+                let (x, len) = next_code_point(s, i);
+                let mut x = char::from_u32(x).unwrap();
+                if to_ascii_lowercase {
+                    x = x.to_ascii_lowercase();
+                }
+                buf.push(x);
+                i += len;
             }
-            buf.push(x);
-            i += len;
+        }
+        consume_dec_buf(buf, &mut dec_buf, is_query);
+    }
+}
+
+fn consume_dec_buf(buf: &mut String, dec_buf: &mut Vec<u8>, is_query: bool) {
+    for chunk in dec_buf.utf8_chunks() {
+        for ch in chunk.valid().chars() {
+            if is_ucschar(ch as u32) || (is_query && is_iprivate(ch as u32)) {
+                buf.push(ch);
+            } else {
+                for x in ch.encode_utf8(&mut [0; 4]).bytes() {
+                    encode_byte(x, buf);
+                }
+            }
+        }
+        for &x in chunk.invalid() {
+            encode_byte(x, buf);
         }
     }
+    dec_buf.clear();
 }
 
 // Taken from `impl Display for Ipv6Addr`.