Skip to content

Commit

Permalink
Fix normalization of IRIs
Browse files Browse the repository at this point in the history
  • Loading branch information
yescallop committed Aug 19, 2024
1 parent fc14575 commit ca12fc5
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 38 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,12 @@ A *[URI reference]* is either a *[URI]* or a *[relative reference]*. If it start
a relative reference. For example, `//example.org/`, `/index.html`, `../`, `foo`,
`?bar`, and `#baz` are relative references.

An *[IRI]* (reference) is an internationalized version of a URI (reference)
that may contain non-ASCII characters.

[URI reference]: https://datatracker.ietf.org/doc/html/rfc3986#section-4.1
[URI]: https://datatracker.ietf.org/doc/html/rfc3986#section-3
[IRI]: https://datatracker.ietf.org/doc/html/rfc3987#section-2.2
[relative reference]: https://datatracker.ietf.org/doc/html/rfc3986#section-4.2
[scheme]: https://datatracker.ietf.org/doc/html/rfc3986#section-3.1

Expand Down
12 changes: 6 additions & 6 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -499,12 +499,13 @@ macro_rules! ri_maybe_ref {
#[doc = concat!("Normalizes the ", $name, ".")]
///
/// This method applies the syntax-based normalization described in
/// [Section 6.2.2 of RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986#section-6.2.2),
/// [Section 6.2.2 of RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986#section-6.2.2)
/// and [Section 5.3.2 of RFC 3987](https://datatracker.ietf.org/doc/html/rfc3987#section-5.3.2),
/// which is effectively equivalent to taking the following steps in order:
///
/// - Decode any percent-encoded octet that corresponds to an unreserved character.
/// - Decode any percent-encoded octets that correspond to an allowed character which is not reserved.
/// - Uppercase the hexadecimal digits within all percent-encoded octets.
/// - Lowercase the scheme and the host except the percent-encoded octets.
/// - Lowercase all ASCII characters within the scheme and the host except the percent-encoded octets.
/// - Turn any IPv6 literal address into its canonical form as per
/// [RFC 5952](https://datatracker.ietf.org/doc/html/rfc5952).
/// - If the port is empty, remove its `':'` delimiter.
Expand All @@ -514,10 +515,9 @@ macro_rules! ri_maybe_ref {
/// - If `self` contains no authority and its path would start with
/// `"//"`, prepend `"/."` to the path.
///
/// [`UriRef::resolve_against`]: crate::UriRef::resolve_against
///
/// This method is idempotent: `self.normalize()` equals `self.normalize().normalize()`.
///
/// [`UriRef::resolve_against`]: crate::UriRef::resolve_against
/// [`remove_dot_segments`]: https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
///
/// # Examples
Expand All @@ -531,7 +531,7 @@ macro_rules! ri_maybe_ref {
/// ```
#[must_use]
pub fn normalize(&self) -> $Ty<String> {
RiRef::new_pair(normalizer::normalize(self.as_ref_loose()))
RiRef::new_pair(normalizer::normalize(self.as_ref_loose(), $must_be_ascii))
}

$(
Expand Down
4 changes: 2 additions & 2 deletions src/encoding/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ const INDEX_PCT_ENCODED: usize = 256;
const INDEX_UCSCHAR: usize = 256 + 1;
const INDEX_IPRIVATE: usize = 256 + 2;

const fn is_ucschar(x: u32) -> bool {
/// Returns `true` if the code point `x` falls in one of the `ucschar`
/// ranges of RFC 3987, i.e. the non-ASCII characters an IRI may contain
/// outside of the private-use areas.
pub(crate) const fn is_ucschar(x: u32) -> bool {
    match x {
        // Ranges within the Basic Multilingual Plane.
        0xa0..=0xd7ff | 0xf900..=0xfdcf | 0xfdf0..=0xffef => true,
        // Planes 1 through 13: everything except the last two code points
        // of each plane (those ending in FFFE and FFFF).
        0x1_0000..=0xd_ffff => (x & 0xffff) <= 0xfffd,
        // Plane 14 is only partially allowed.
        0xe_1000..=0xe_fffd => true,
        _ => false,
    }
}

const fn is_iprivate(x: u32) -> bool {
/// Returns `true` if the code point `x` falls in one of the `iprivate`
/// ranges of RFC 3987: U+E000..=U+F8FF, U+F0000..=U+FFFFD and
/// U+100000..=U+10FFFD.
///
/// Values above U+10FFFD are not code points in any private-use area and
/// are always rejected.
pub(crate) const fn is_iprivate(x: u32) -> bool {
    match x {
        // Private Use Area in the Basic Multilingual Plane.
        0xe000..=0xf8ff => true,
        // Planes 15 and 16, minus the last two code points of each plane
        // (those ending in FFFE and FFFF). The explicit upper bound keeps
        // out-of-range `u32` values such as 0x110000 from matching.
        0xf_0000..=0x10_fffd => (x & 0xffff) <= 0xfffd,
        _ => false,
    }
}

Expand Down
4 changes: 4 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,12 @@
//! a relative reference. For example, `//example.org/`, `/index.html`, `../`, `foo`,
//! `?bar`, and `#baz` are relative references.
//!
//! An *[IRI]* (reference) is an internationalized version of a URI (reference)
//! that may contain non-ASCII characters.
//!
//! [URI]: https://datatracker.ietf.org/doc/html/rfc3986#section-3
//! [URI reference]: https://datatracker.ietf.org/doc/html/rfc3986#section-4.1
//! [IRI]: https://datatracker.ietf.org/doc/html/rfc3987#section-2.2
//! [relative reference]: https://datatracker.ietf.org/doc/html/rfc3986#section-4.2
//! [scheme]: https://datatracker.ietf.org/doc/html/rfc3986#section-3.1
//!
Expand Down
118 changes: 88 additions & 30 deletions src/normalizer.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
use crate::{
common::Ref,
encoding::{decode_octet, next_code_point, table::UNRESERVED},
encoding::{
decode_octet, encode_byte, next_code_point,
table::{is_iprivate, is_ucschar, UNRESERVED},
},
internal::{HostMeta, Meta},
parser, resolver,
};
use alloc::string::String;
use alloc::{string::String, vec::Vec};
use core::{fmt::Write, num::NonZeroUsize};

pub(crate) fn normalize(r: Ref<'_, '_>) -> (String, Meta) {
pub(crate) fn normalize(r: Ref<'_, '_>, must_be_ascii: bool) -> (String, Meta) {
// For "a://[::ffff:5:9]/" the capacity is not enough,
// but it's fine since this rarely happens.
let mut buf = String::with_capacity(r.as_str().len());
Expand All @@ -16,12 +19,12 @@ pub(crate) fn normalize(r: Ref<'_, '_>) -> (String, Meta) {
let mut path_buf = String::with_capacity(path.len());

if r.has_scheme() && path.starts_with('/') {
normalize_estr(&mut buf, path, false);
normalize_estr(&mut buf, path, false, must_be_ascii, false);
resolver::remove_dot_segments(&mut path_buf, &buf);
buf.clear();
} else {
// Don't remove dot segments from relative reference or rootless path.
normalize_estr(&mut path_buf, path, false);
normalize_estr(&mut path_buf, path, false, must_be_ascii, false);
}

let mut meta = Meta::default();
Expand All @@ -37,7 +40,7 @@ pub(crate) fn normalize(r: Ref<'_, '_>) -> (String, Meta) {
buf.push_str("//");

if let Some(userinfo) = auth.userinfo() {
normalize_estr(&mut buf, userinfo.as_str(), false);
normalize_estr(&mut buf, userinfo.as_str(), false, must_be_ascii, false);
buf.push('@');
}

Expand All @@ -63,7 +66,7 @@ pub(crate) fn normalize(r: Ref<'_, '_>) -> (String, Meta) {
HostMeta::RegName => {
let start = buf.len();
let host = auth.host();
normalize_estr(&mut buf, host, true);
normalize_estr(&mut buf, host, true, must_be_ascii, false);

if buf.len() < start + host.len() {
// Only reparse when the length is less than before.
Expand Down Expand Up @@ -92,48 +95,103 @@ pub(crate) fn normalize(r: Ref<'_, '_>) -> (String, Meta) {

if let Some(query) = r.query() {
buf.push('?');
normalize_estr(&mut buf, query.as_str(), false);
normalize_estr(&mut buf, query.as_str(), false, must_be_ascii, true);
meta.query_end = NonZeroUsize::new(buf.len());
}

if let Some(fragment) = r.fragment() {
buf.push('#');
normalize_estr(&mut buf, fragment.as_str(), false);
normalize_estr(&mut buf, fragment.as_str(), false, must_be_ascii, false);
}

(buf, meta)
}

fn normalize_estr(buf: &mut String, s: &str, to_lowercase: bool) {
fn normalize_estr(
buf: &mut String,
s: &str,
to_ascii_lowercase: bool,
must_be_ascii: bool,
is_query: bool,
) {
let s = s.as_bytes();
let mut i = 0;

while i < s.len() {
// FIXME: Change this to also decode encoded ucschar and iprivate chars.
if s[i] == b'%' {
let (hi, lo) = (s[i + 1], s[i + 2]);
let mut octet = decode_octet(hi, lo);
if UNRESERVED.allows_ascii(octet) {
if to_lowercase {
octet = octet.to_ascii_lowercase();
if must_be_ascii {
while i < s.len() {
let mut x = s[i];
if x == b'%' {
let (hi, lo) = (s[i + 1], s[i + 2]);
let mut octet = decode_octet(hi, lo);
if UNRESERVED.allows_ascii(octet) {
if to_ascii_lowercase {
octet = octet.to_ascii_lowercase();
}
buf.push(octet as char);
} else {
buf.push('%');
buf.push(hi.to_ascii_uppercase() as char);
buf.push(lo.to_ascii_uppercase() as char);
}
buf.push(octet as char);
i += 3;
} else {
buf.push('%');
buf.push(hi.to_ascii_uppercase() as char);
buf.push(lo.to_ascii_uppercase() as char);
if to_ascii_lowercase {
x = x.to_ascii_lowercase();
}
buf.push(x as char);
i += 1;
}
i += 3;
} else {
let (x, len) = next_code_point(s, i);
let mut x = char::from_u32(x).unwrap();
if to_lowercase {
x = x.to_ascii_lowercase();
}
} else {
let mut dec_buf = Vec::new();

while i < s.len() {
if s[i] == b'%' {
let (hi, lo) = (s[i + 1], s[i + 2]);
let mut octet = decode_octet(hi, lo);
if UNRESERVED.allows_ascii(octet) {
consume_dec_buf(buf, &mut dec_buf, is_query);

if to_ascii_lowercase {
octet = octet.to_ascii_lowercase();
}
buf.push(octet as char);
} else {
dec_buf.push(octet);
}
i += 3;
} else {
consume_dec_buf(buf, &mut dec_buf, is_query);

let (x, len) = next_code_point(s, i);
let mut x = char::from_u32(x).unwrap();
if to_ascii_lowercase {
x = x.to_ascii_lowercase();
}
buf.push(x);
i += len;
}
buf.push(x);
i += len;
}
consume_dec_buf(buf, &mut dec_buf, is_query);
}
}

/// Flushes the pending percent-decoded octets in `dec_buf` into `buf`,
/// then empties `dec_buf`.
///
/// The octets are split into UTF-8 chunks. A validly decoded character is
/// written literally when it is a `ucschar`, or an `iprivate` character
/// while `is_query` is set; every other character is written back through
/// `encode_byte`, one UTF-8 byte at a time. Bytes that are not valid UTF-8
/// are likewise passed to `encode_byte` individually.
fn consume_dec_buf(buf: &mut String, dec_buf: &mut Vec<u8>, is_query: bool) {
    for chunk in dec_buf.utf8_chunks() {
        for ch in chunk.valid().chars() {
            let code_point = u32::from(ch);
            let keep_literal = is_ucschar(code_point) || (is_query && is_iprivate(code_point));

            if keep_literal {
                buf.push(ch);
                continue;
            }

            // Not allowed to appear literally here: hand each byte back to the encoder.
            let mut utf8 = [0u8; 4];
            for &byte in ch.encode_utf8(&mut utf8).as_bytes() {
                encode_byte(byte, buf);
            }
        }

        // Octets that do not form valid UTF-8 go through the encoder as-is.
        for &byte in chunk.invalid() {
            encode_byte(byte, buf);
        }
    }
    dec_buf.clear();
}

// Taken from `impl Display for Ipv6Addr`.
Expand Down

0 comments on commit ca12fc5

Please sign in to comment.