From db864407200ce3dffb5294b1657f30a74e78b17d Mon Sep 17 00:00:00 2001 From: Kornel Date: Thu, 31 Oct 2024 19:10:44 +0000 Subject: [PATCH] Trim bloated encoding table --- src/base/encoding.rs | 205 ++----------------------------------------- src/rewriter/mod.rs | 11 +-- 2 files changed, 12 insertions(+), 204 deletions(-) diff --git a/src/base/encoding.rs b/src/base/encoding.rs index 03ba25b1..92c517d8 100644 --- a/src/base/encoding.rs +++ b/src/base/encoding.rs @@ -5,193 +5,31 @@ use std::sync::Arc; /// This serves as a map from integer to [`Encoding`], which allows more efficient /// sets/gets of the [`SharedEncoding`]. -static ALL_ENCODINGS: [&Encoding; 228] = [ - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::ISO_8859_2_INIT, - &encoding_rs::ISO_8859_3_INIT, - &encoding_rs::ISO_8859_4_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::ISO_8859_10_INIT, - &encoding_rs::ISO_8859_15_INIT, - &encoding_rs::IBM866_INIT, - &encoding_rs::MACINTOSH_INIT, - &encoding_rs::KOI8_R_INIT, - &encoding_rs::GBK_INIT, - &encoding_rs::BIG5_INIT, +static ALL_ENCODINGS: [&Encoding; 40] = [ &encoding_rs::UTF_8_INIT, - &encoding_rs::KOI8_R_INIT, &encoding_rs::SHIFT_JIS_INIT, - &encoding_rs::UTF_16LE_INIT, - &encoding_rs::SHIFT_JIS_INIT, - &encoding_rs::IBM866_INIT, - &encoding_rs::UTF_8_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::GBK_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::WINDOWS_1250_INIT, - &encoding_rs::WINDOWS_1251_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::GBK_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::ISO_8859_2_INIT, - &encoding_rs::WINDOWS_1253_INIT, - &encoding_rs::ISO_8859_3_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::ISO_8859_4_INIT, - &encoding_rs::WINDOWS_1255_INIT, &encoding_rs::BIG5_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::UTF_16LE_INIT, - &encoding_rs::WINDOWS_1256_INIT, - &encoding_rs::IBM866_INIT, - &encoding_rs::ISO_8859_10_INIT, - 
&encoding_rs::WINDOWS_1257_INIT, - &encoding_rs::WINDOWS_1258_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::EUC_KR_INIT, &encoding_rs::EUC_JP_INIT, - &encoding_rs::KOI8_R_INIT, - &encoding_rs::KOI8_R_INIT, &encoding_rs::EUC_KR_INIT, - &encoding_rs::SHIFT_JIS_INIT, - &encoding_rs::KOI8_U_INIT, - &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::WINDOWS_874_INIT, &encoding_rs::GB18030_INIT, - &encoding_rs::EUC_KR_INIT, &encoding_rs::GBK_INIT, - &encoding_rs::WINDOWS_874_INIT, - &encoding_rs::BIG5_INIT, - &encoding_rs::UTF_16LE_INIT, - &encoding_rs::GBK_INIT, - &encoding_rs::ISO_8859_8_I_INIT, - &encoding_rs::KOI8_R_INIT, - &encoding_rs::EUC_KR_INIT, - &encoding_rs::KOI8_U_INIT, - &encoding_rs::WINDOWS_1250_INIT, - &encoding_rs::EUC_KR_INIT, - &encoding_rs::WINDOWS_1251_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::GBK_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::ISO_8859_2_INIT, - &encoding_rs::WINDOWS_1253_INIT, - &encoding_rs::ISO_8859_3_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::ISO_8859_4_INIT, - &encoding_rs::WINDOWS_1255_INIT, - &encoding_rs::ISO_8859_5_INIT, - &encoding_rs::BIG5_INIT, - &encoding_rs::WINDOWS_1256_INIT, &encoding_rs::IBM866_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::WINDOWS_1257_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::WINDOWS_1258_INIT, - &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::ISO_8859_5_INIT, - &encoding_rs::UTF_16BE_INIT, - &encoding_rs::UTF_16LE_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::SHIFT_JIS_INIT, - &encoding_rs::EUC_JP_INIT, - &encoding_rs::ISO_8859_10_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::WINDOWS_874_INIT, - &encoding_rs::ISO_8859_2_INIT, - 
&encoding_rs::ISO_8859_3_INIT, - &encoding_rs::ISO_8859_13_INIT, - &encoding_rs::ISO_8859_4_INIT, - &encoding_rs::ISO_8859_14_INIT, - &encoding_rs::ISO_8859_5_INIT, - &encoding_rs::ISO_8859_15_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::GBK_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::UTF_16LE_INIT, - &encoding_rs::MACINTOSH_INIT, - &encoding_rs::SHIFT_JIS_INIT, - &encoding_rs::SHIFT_JIS_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::ISO_8859_10_INIT, - &encoding_rs::ISO_8859_4_INIT, - &encoding_rs::GBK_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::ISO_8859_2_INIT, - &encoding_rs::WINDOWS_874_INIT, - &encoding_rs::ISO_8859_2_INIT, &encoding_rs::ISO_8859_2_INIT, - &encoding_rs::REPLACEMENT_INIT, - &encoding_rs::ISO_8859_3_INIT, &encoding_rs::ISO_8859_3_INIT, - &encoding_rs::ISO_8859_13_INIT, &encoding_rs::ISO_8859_4_INIT, - &encoding_rs::ISO_8859_4_INIT, - &encoding_rs::ISO_8859_14_INIT, - &encoding_rs::ISO_8859_5_INIT, &encoding_rs::ISO_8859_5_INIT, - &encoding_rs::ISO_8859_5_INIT, - &encoding_rs::ISO_8859_15_INIT, - &encoding_rs::ISO_8859_6_INIT, &encoding_rs::ISO_8859_6_INIT, &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_10_INIT, - &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::ISO_8859_8_I_INIT, &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::ISO_8859_3_INIT, - &encoding_rs::EUC_KR_INIT, - &encoding_rs::BIG5_INIT, - &encoding_rs::SHIFT_JIS_INIT, &encoding_rs::ISO_8859_10_INIT, - &encoding_rs::WINDOWS_874_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::ISO_8859_2_INIT, &encoding_rs::ISO_8859_13_INIT, - &encoding_rs::ISO_8859_3_INIT, &encoding_rs::ISO_8859_14_INIT, - 
&encoding_rs::WINDOWS_874_INIT, - &encoding_rs::ISO_8859_4_INIT, - &encoding_rs::ISO_8859_15_INIT, &encoding_rs::ISO_8859_15_INIT, - &encoding_rs::WINDOWS_1254_INIT, &encoding_rs::ISO_8859_16_INIT, - &encoding_rs::ISO_8859_10_INIT, - &encoding_rs::EUC_KR_INIT, - &encoding_rs::ISO_8859_15_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::UTF_16BE_INIT, - &encoding_rs::UTF_16LE_INIT, - &encoding_rs::MACINTOSH_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_8_I_INIT, - &encoding_rs::SHIFT_JIS_INIT, + &encoding_rs::KOI8_R_INIT, + &encoding_rs::KOI8_U_INIT, &encoding_rs::MACINTOSH_INIT, - &encoding_rs::REPLACEMENT_INIT, - &encoding_rs::ISO_2022_JP_INIT, - &encoding_rs::ISO_2022_JP_INIT, - &encoding_rs::REPLACEMENT_INIT, - &encoding_rs::REPLACEMENT_INIT, - &encoding_rs::REPLACEMENT_INIT, &encoding_rs::WINDOWS_1250_INIT, &encoding_rs::WINDOWS_1251_INIT, &encoding_rs::WINDOWS_1252_INIT, @@ -201,39 +39,14 @@ static ALL_ENCODINGS: [&Encoding; 228] = [ &encoding_rs::WINDOWS_1256_INIT, &encoding_rs::WINDOWS_1257_INIT, &encoding_rs::WINDOWS_1258_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_8_I_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::EUC_KR_INIT, - &encoding_rs::UTF_8_INIT, - &encoding_rs::UTF_8_INIT, - &encoding_rs::EUC_KR_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::EUC_KR_INIT, + &encoding_rs::WINDOWS_874_INIT, &encoding_rs::X_MAC_CYRILLIC_INIT, &encoding_rs::X_USER_DEFINED_INIT, - &encoding_rs::GBK_INIT, - &encoding_rs::UTF_16LE_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::ISO_8859_2_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::ISO_8859_3_INIT, - &encoding_rs::ISO_8859_4_INIT, - &encoding_rs::ISO_8859_5_INIT, - &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::UTF_8_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::ISO_8859_7_INIT, - 
&encoding_rs::X_MAC_CYRILLIC_INIT, + // non-ASCII-compatible &encoding_rs::REPLACEMENT_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::UTF_8_INIT, - &encoding_rs::ISO_8859_5_INIT, - &encoding_rs::EUC_JP_INIT, + &encoding_rs::UTF_16BE_INIT, + &encoding_rs::UTF_16LE_INIT, + &encoding_rs::ISO_2022_JP_INIT, ]; fn encoding_to_index(encoding: AsciiCompatibleEncoding) -> usize { diff --git a/src/rewriter/mod.rs b/src/rewriter/mod.rs index e6466d50..07afd53c 100644 --- a/src/rewriter/mod.rs +++ b/src/rewriter/mod.rs @@ -31,17 +31,12 @@ impl AsciiCompatibleEncoding { /// Returns `Some` if `Encoding` is ascii-compatible, or `None` otherwise. #[must_use] pub fn new(encoding: &'static Encoding) -> Option<Self> { - if encoding.is_ascii_compatible() { - Some(Self(encoding)) - } else { - None - } + encoding.is_ascii_compatible().then_some(Self(encoding)) } fn from_mimetype(mime: &Mime) -> Option<Self> { - mime.get_param("charset") - .and_then(|cs| Encoding::for_label_no_replacement(cs.as_str().as_bytes())) - .and_then(Self::new) + let cs = mime.get_param("charset")?; + Self::new(Encoding::for_label_no_replacement(cs.as_str().as_bytes())?) } /// Returns the most commonly used UTF-8 encoding.