From f1fbd82f08b98962f109869dbea9a8acdcd31a3c Mon Sep 17 00:00:00 2001 From: Kornel Date: Tue, 24 Dec 2024 13:51:24 +0000 Subject: [PATCH 1/2] Make LocalNameHash smaller --- src/html/local_name.rs | 81 +++++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 37 deletions(-) diff --git a/src/html/local_name.rs b/src/html/local_name.rs index a334e000..0d267c0a 100644 --- a/src/html/local_name.rs +++ b/src/html/local_name.rs @@ -26,53 +26,60 @@ use encoding_rs::Encoding; // we are safe here, since we'll just get first character shifted left // by zeroes as repetitave 1 digits get added to the hash. // +// LocalNameHash is built incrementally as tags are parsed, so it needs +// to be able to invalidate itself if parsing an unrepresentable name. +// `EMPTY_HASH` is used as a sentinel value. +// // Pub only for integration tests #[derive(Debug, PartialEq, Eq, Copy, Clone, Default, Hash)] -pub struct LocalNameHash(Option); +pub struct LocalNameHash(u64); + +const EMPTY_HASH: u64 = !0; impl LocalNameHash { #[inline] #[must_use] pub const fn new() -> Self { - Self(Some(0)) + Self(0) } #[inline] #[must_use] pub const fn is_empty(&self) -> bool { - self.0.is_none() + self.0 == EMPTY_HASH } #[inline] pub fn update(&mut self, ch: u8) { - if let Some(h) = self.0 { - // NOTE: check if we still have space for yet another - // character and if not then invalidate the hash. - // Note, that we can't have `1` (which is encoded as 0b00000) as - // a first character of a tag name, so it's safe to perform - // check this way. - self.0 = if h >> (64 - 5) == 0 { - match ch { - // NOTE: apply 0x1F mask on ASCII alpha to convert it to the - // number from 1 to 26 (character case is controlled by one of - // upper bits which we eliminate with the mask). Then add - // 5, since numbers from 0 to 5 are reserved for digits. - // Aftwerards put result as 5 lower bits of the hash. 
- b'a'..=b'z' | b'A'..=b'Z' => Some((h << 5) | ((u64::from(ch) & 0x1F) + 5)), - - // NOTE: apply 0x0F mask on ASCII digit to convert it to number - // from 1 to 6. Then subtract 1 to make it zero-based. - // Afterwards, put result as lower bits of the hash. - b'1'..=b'6' => Some((h << 5) | ((u64::from(ch) & 0x0F) - 1)), - - // NOTE: for any other characters hash function is not - // applicable, so we completely invalidate the hash. - _ => None, - } - } else { - None - }; - } + let h = self.0; + + // NOTE: check if we still have space for yet another + // character and if not then invalidate the hash. + // Note, that we can't have `1` (which is encoded as 0b00000) as + // a first character of a tag name, so it's safe to perform + // check this way. + // EMPTY_HASH has all bits set, so it will fail this check. + self.0 = if h >> (64 - 5) == 0 { + match ch { + // NOTE: apply 0x1F mask on ASCII alpha to convert it to the + // number from 1 to 26 (character case is controlled by one of + // upper bits which we eliminate with the mask). Then add + // 5, since numbers from 0 to 5 are reserved for digits. + // Aftwerards put result as 5 lower bits of the hash. + b'a'..=b'z' | b'A'..=b'Z' => (h << 5) | ((u64::from(ch) & 0x1F) + 5), + + // NOTE: apply 0x0F mask on ASCII digit to convert it to number + // from 1 to 6. Then subtract 1 to make it zero-based. + // Afterwards, put result as lower bits of the hash. + b'1'..=b'6' => (h << 5) | ((u64::from(ch) & 0x0F) - 1), + + // NOTE: for any other characters hash function is not + // applicable, so we completely invalidate the hash. 
+ _ => EMPTY_HASH, + } + } else { + EMPTY_HASH + }; } } @@ -92,10 +99,7 @@ impl From<&str> for LocalNameHash { impl PartialEq for LocalNameHash { #[inline] fn eq(&self, tag: &Tag) -> bool { - match self.0 { - Some(h) => *tag as u64 == h, - None => false, - } + self.0 == *tag as u64 } } @@ -159,7 +163,10 @@ impl PartialEq> for LocalName<'_> { use LocalName::{Bytes, Hash}; match (self, other) { - (Hash(s), Hash(o)) => s == o, + (Hash(s), Hash(o)) => { + debug_assert!(!s.is_empty()); + s == o + } (Bytes(s), Bytes(o)) => s.eq_ignore_ascii_case(o), _ => false, } @@ -172,7 +179,7 @@ mod tests { #[test] fn from_str() { - assert_eq!(LocalNameHash::from("div"), LocalNameHash(Some(9691u64))); + assert_eq!(LocalNameHash::from("div"), LocalNameHash(9691u64)); } #[test] From f63292ac78dd30d65f8a5d8375e33da95a31c7a0 Mon Sep 17 00:00:00 2001 From: Kornel Date: Tue, 24 Dec 2024 14:22:58 +0000 Subject: [PATCH 2/2] Debug for LocalNameHash --- src/html/local_name.rs | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/html/local_name.rs b/src/html/local_name.rs index 0d267c0a..07626efe 100644 --- a/src/html/local_name.rs +++ b/src/html/local_name.rs @@ -1,6 +1,7 @@ use super::Tag; use crate::base::{Bytes, HasReplacementsError, Range}; use encoding_rs::Encoding; +use std::fmt; // NOTE: All standard tag names contain only ASCII alpha characters // and digits from 1 to 6 (in numbered header tags, i.e.

<h1> - <h6>

). @@ -31,7 +32,7 @@ use encoding_rs::Encoding; // `EMPTY_HASH` is used as a sentinel value. // // Pub only for integration tests -#[derive(Debug, PartialEq, Eq, Copy, Clone, Default, Hash)] +#[derive(PartialEq, Eq, Copy, Clone, Default, Hash)] pub struct LocalNameHash(u64); const EMPTY_HASH: u64 = !0; @@ -83,6 +84,33 @@ impl LocalNameHash { } } +impl fmt::Debug for LocalNameHash { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.is_empty() { + return f.write_str("N/A"); + } + + let mut reverse_buf = [0u8; 12]; + let mut pos = 11; + let mut h = self.0; + loop { + reverse_buf[pos] = match (h & 31) as u8 { + v @ 6.. => v + (b'a' - 6), + v => v + b'1', + }; + h >>= 5; + if h == 0 || pos == 0 { + break; + } + pos -= 1; + } + std::str::from_utf8(&reverse_buf[pos..]) + .unwrap_or_default() + .fmt(f) + } +} + impl From<&str> for LocalNameHash { #[inline] fn from(string: &str) -> Self {