diff --git a/Cargo.toml b/Cargo.toml index a7f0b615..518f4e37 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,7 @@ safemem = "0.3.3" selectors = "0.22.0" thiserror = "1.0.2" hashbrown = "0.13.1" +mime = "0.3.16" [dev-dependencies] criterion = "0.4.0" diff --git a/src/lib.rs b/src/lib.rs index 707308a7..7940aff6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -154,6 +154,8 @@ cfg_if! { if #[cfg(feature = "integration_test")] { pub mod selectors_vm; + pub use self::rewriter::SharedEncoding; + pub use self::transform_stream::{ StartTagHandlingResult, TransformController, TransformStream, TransformStreamSettings diff --git a/src/rewritable_units/document_end.rs b/src/rewritable_units/document_end.rs index f8e0d9f7..daa5af2c 100644 --- a/src/rewritable_units/document_end.rs +++ b/src/rewritable_units/document_end.rs @@ -1,6 +1,5 @@ use super::mutations::content_to_bytes; use super::ContentType; - use encoding_rs::Encoding; use crate::transform_stream::OutputSink; diff --git a/src/rewritable_units/tokens/capturer/mod.rs b/src/rewritable_units/tokens/capturer/mod.rs index c2150bb3..d4d02d6b 100644 --- a/src/rewritable_units/tokens/capturer/mod.rs +++ b/src/rewritable_units/tokens/capturer/mod.rs @@ -4,9 +4,8 @@ mod to_token; use self::text_decoder::TextDecoder; use super::*; use crate::parser::Lexeme; -use crate::rewriter::RewritingError; +use crate::rewriter::{RewritingError, SharedEncoding}; use bitflags::bitflags; -use encoding_rs::Encoding; pub use self::to_token::{ToToken, ToTokenResult}; @@ -30,15 +29,15 @@ pub enum TokenCapturerEvent<'i> { type CapturerEventHandler<'h> = &'h mut dyn FnMut(TokenCapturerEvent) -> Result<(), RewritingError>; pub struct TokenCapturer { - encoding: &'static Encoding, + encoding: SharedEncoding, text_decoder: TextDecoder, capture_flags: TokenCaptureFlags, } impl TokenCapturer { - pub fn new(capture_flags: TokenCaptureFlags, encoding: &'static Encoding) -> Self { + pub fn new(capture_flags: TokenCaptureFlags, encoding: SharedEncoding) -> Self { TokenCapturer { - encoding, + encoding: SharedEncoding::clone(&encoding), text_decoder: TextDecoder::new(encoding), capture_flags, } @@ -70,7 +69,7 @@ impl TokenCapturer { where Lexeme<'i, T>: ToToken, { - match lexeme.to_token(&mut self.capture_flags, self.encoding) { + match lexeme.to_token(&mut self.capture_flags, self.encoding.get()) { ToTokenResult::Token(token) => { self.flush_pending_text(&mut event_handler)?; event_handler(TokenCapturerEvent::LexemeConsumed)?; diff --git a/src/rewritable_units/tokens/capturer/text_decoder.rs b/src/rewritable_units/tokens/capturer/text_decoder.rs index 74c7097d..8374007e 100644 --- a/src/rewritable_units/tokens/capturer/text_decoder.rs +++ b/src/rewritable_units/tokens/capturer/text_decoder.rs @@ -1,27 +1,27 @@ use super::*; use crate::html::TextType; use crate::rewriter::RewritingError; -use encoding_rs::{CoderResult, Decoder, Encoding}; +use encoding_rs::{CoderResult, Decoder}; // NOTE: this can't be refactored into method, because we hold a mutable reference for `self` // during the decoding loop in `feed_text`. macro_rules! emit { ($self:tt, $text:expr, $last:ident, $event_handler:ident) => {{ - let token = TextChunk::new_token($text, $self.last_text_type, $last, $self.encoding); + let token = TextChunk::new_token($text, $self.last_text_type, $last, $self.encoding.get()); $event_handler(TokenCapturerEvent::TokenProduced(Box::new(token))) }}; } pub struct TextDecoder { - encoding: &'static Encoding, + encoding: SharedEncoding, pending_text_streaming_decoder: Option, text_buffer: String, last_text_type: TextType, } impl TextDecoder { - pub fn new(encoding: &'static Encoding) -> Self { + pub fn new(encoding: SharedEncoding) -> Self { TextDecoder { encoding, pending_text_streaming_decoder: None, @@ -49,7 +49,7 @@ impl TextDecoder { last: bool, event_handler: CapturerEventHandler, ) -> Result<(), RewritingError> { - let encoding = self.encoding; + let encoding = self.encoding.get(); let buffer = self.text_buffer.as_mut_str(); let decoder = self diff --git a/src/rewriter/mod.rs b/src/rewriter/mod.rs index 51156a00..b0fc4851 100644 --- a/src/rewriter/mod.rs +++ b/src/rewriter/mod.rs @@ -12,13 +12,46 @@ use crate::parser::ParsingAmbiguityError; use crate::selectors_vm::{self, SelectorMatchingVm}; use crate::transform_stream::*; use encoding_rs::Encoding; +use mime::Mime; +use std::borrow::Cow; +use std::cell::Cell; use std::error::Error as StdError; use std::fmt::{self, Debug}; +use std::ops::Deref; use std::rc::Rc; use thiserror::Error; pub use self::settings::*; +#[derive(Clone)] +pub struct SharedEncoding { + encoding: Rc>, +} + +impl SharedEncoding { + pub fn new(encoding: AsciiCompatibleEncoding) -> SharedEncoding { + SharedEncoding { + encoding: Rc::new(Cell::new(encoding)), + } + } + + pub fn get(&self) -> &'static Encoding { + self.encoding.get().0 + } + + pub fn set(&self, encoding: AsciiCompatibleEncoding) { + self.encoding.set(encoding); + } +} + +impl Deref for SharedEncoding { + type Target = Encoding; + + fn deref(&self) -> &'static Encoding { + self.get() + } +} + /// This is an encoding known to be ASCII-compatible. /// /// Non-ASCII-compatible encodings (`UTF-16LE`, `UTF-16BE`, `ISO-2022-JP` and @@ -140,6 +173,40 @@ macro_rules! guarded { }}; } +fn ascii_compatible_encoding_from_mimetype(mime: &Mime) -> Option { + mime.get_param("charset") + .and_then(|cs| Encoding::for_label_no_replacement(cs.as_str().as_bytes())) + .and_then(AsciiCompatibleEncoding::new) +} + +fn handler_adjust_charset_on_meta_tag( + encoding: SharedEncoding, +) -> ( + Cow<'static, crate::Selector>, + ElementContentHandlers<'static>, +) { + element!("meta", move |el| { + let attr_charset = el + .get_attribute("charset") + .and_then(|cs| Encoding::for_label_no_replacement(cs.as_bytes())) + .and_then(AsciiCompatibleEncoding::new); + + let attr_http_equiv = el + .get_attribute("http-equiv") + .filter(|http_equiv| http_equiv.eq_ignore_ascii_case("Content-Type")) + .and_then(|_| el.get_attribute("content")) + .and_then(|ct| ct.parse::().ok()) + .as_ref() + .and_then(ascii_compatible_encoding_from_mimetype); + + if let Some(charset) = attr_charset.or(attr_http_equiv) { + encoding.set(charset) + } + + Ok(()) + }) +} + impl<'h, O: OutputSink> HtmlRewriter<'h, O> { /// Constructs a new rewriter with the provided `settings` that writes /// the output to the `output_sink`. @@ -150,12 +217,24 @@ impl<'h, O: OutputSink> HtmlRewriter<'h, O> { /// /// [`OutputSink`]: trait.OutputSink.html pub fn new<'s>(settings: Settings<'h, 's>, output_sink: O) -> Self { - let encoding = settings.encoding; + let encoding = SharedEncoding::new(settings.encoding.into()); let mut selectors_ast = selectors_vm::Ast::default(); let mut dispatcher = ContentHandlersDispatcher::default(); - let has_selectors = !settings.element_content_handlers.is_empty(); + let has_selectors = + !settings.element_content_handlers.is_empty() || settings.adjust_charset_on_meta_tag; + + let charset_adjust_handler = match settings.adjust_charset_on_meta_tag { + true => Some(handler_adjust_charset_on_meta_tag(SharedEncoding::clone( + &encoding, + ))), + false => None, + }; + + let element_content_handlers = charset_adjust_handler + .into_iter() + .chain(settings.element_content_handlers); - for (selector, handlers) in settings.element_content_handlers { + for (selector, handlers) in element_content_handlers { let locator = dispatcher.add_selector_associated_handlers(handlers); selectors_ast.add_selector(&selector, locator); @@ -171,7 +250,7 @@ impl<'h, O: OutputSink> HtmlRewriter<'h, O> { let selector_matching_vm = if has_selectors { Some(SelectorMatchingVm::new( selectors_ast, - encoding.into(), + settings.encoding.into(), Rc::clone(&memory_limiter), settings.enable_esi_tags, )) @@ -188,7 +267,7 @@ impl<'h, O: OutputSink> HtmlRewriter<'h, O> { .memory_settings .preallocated_parsing_buffer_size, memory_limiter, - encoding: encoding.into(), + encoding, strict: settings.strict, }); @@ -289,6 +368,7 @@ mod tests { use crate::html_content::ContentType; use crate::test_utils::{Output, ASCII_COMPATIBLE_ENCODINGS, NON_ASCII_COMPATIBLE_ENCODINGS}; use encoding_rs::Encoding; + use itertools::Itertools; use std::cell::RefCell; use std::convert::TryInto; use std::rc::Rc; @@ -307,6 +387,17 @@ mod tests { rewriter.end().unwrap(); } + fn rewrite_html_bytes<'a>(html: &[u8], settings: Settings) -> Vec { + let mut out: Vec = Vec::with_capacity(html.len()); + + let mut rewriter = HtmlRewriter::new(settings, |c: &[u8]| out.extend_from_slice(c)); + + rewriter.write(html).unwrap(); + rewriter.end().unwrap(); + + out + } + #[test] fn rewrite_html_str() { let res = rewrite_str( @@ -558,6 +649,110 @@ mod tests { assert_eq!(res, "?"); } + #[test] + fn test_rewrite_adjust_charset_on_meta_tag_attribute_charset() { + use crate::html_content::{ContentType, TextChunk}; + + let enthusiastic_text_handler = || { + doc_text!(move |text: &mut TextChunk| { + let new_text = text.as_str().replace('!', "!!!"); + text.replace(&new_text, ContentType::Text); + Ok(()) + }) + }; + let html: Vec = [ + r#"I love "# + .as_bytes() + .to_vec(), + vec![0xd5, 0xec, 0xb3, 0xcb, 0xdc], + r#"!"#.as_bytes().to_vec(), + ] + .into_iter() + .concat(); + let expected: Vec = html + .iter() + .copied() + .flat_map(|c| match c { + b'!' => vec![b'!', b'!', b'!'], + c => vec![c], + }) + .collect(); + + let transformed_no_charset_adjustment: Vec = rewrite_html_bytes( + &html, + Settings { + document_content_handlers: vec![enthusiastic_text_handler()], + ..Settings::default() + }, + ); + + // Without charset adjustment the response has to be corrupted: + assert_ne!(transformed_no_charset_adjustment, expected); + + let transformed_charset_adjustment: Vec = rewrite_html_bytes( + &html, + Settings { + document_content_handlers: vec![enthusiastic_text_handler()], + adjust_charset_on_meta_tag: true, + ..Settings::default() + }, + ); + + // If it adapts the charset according to the meta tag everything will be correctly + // encoded in windows-1251: + assert_eq!(transformed_charset_adjustment, expected); + } + + #[test] + fn test_rewrite_adjust_charset_on_meta_tag_attribute_content_type() { + use crate::html_content::{ContentType, TextChunk}; + + let enthusiastic_text_handler = || { + doc_text!(move |text: &mut TextChunk| { + let new_text = text.as_str().replace('!', "!!!"); + text.replace(&new_text, ContentType::Text); + Ok(()) + }) + }; + let html: Vec = [ + r#"I love "#.as_bytes().to_vec(), + vec![0xd5, 0xec, 0xb3, 0xcb, 0xdc], + r#"!"#.as_bytes().to_vec(), + ].into_iter().concat(); + let expected: Vec = html + .iter() + .copied() + .flat_map(|c| match c { + b'!' => vec![b'!', b'!', b'!'], + c => vec![c], + }) + .collect(); + + let transformed_no_charset_adjustment: Vec = rewrite_html_bytes( + &html, + Settings { + document_content_handlers: vec![enthusiastic_text_handler()], + ..Settings::default() + }, + ); + + // Without charset adjustment the response has to be corrupted: + assert_ne!(transformed_no_charset_adjustment, expected); + + let transformed_charset_adjustment: Vec = rewrite_html_bytes( + &html, + Settings { + document_content_handlers: vec![enthusiastic_text_handler()], + adjust_charset_on_meta_tag: true, + ..Settings::default() + }, + ); + + // If it adapts the charset according to the meta tag everything will be correctly + // encoded in windows-1251: + assert_eq!(transformed_charset_adjustment, expected); + } + mod fatal_errors { use super::*; use crate::errors::MemoryLimitExceededError; diff --git a/src/rewriter/settings.rs b/src/rewriter/settings.rs index b5822087..090aaabb 100644 --- a/src/rewriter/settings.rs +++ b/src/rewriter/settings.rs @@ -518,6 +518,32 @@ pub struct Settings<'h, 's> { pub strict: bool, pub enable_esi_tags: bool, + + /// If enabled the rewriter will dynamically change the charset when it encounters a `meta` tag + /// that specifies the charset. + /// + /// The charset can be modified by the `meta` tag with + /// + /// ```html + /// + /// ``` + /// + /// or + /// + /// ```html + /// + /// ``` + /// + /// Note that an explicit `charset` in the `Content-type` header should take precedence over + /// the `meta` tag, so only enable this if the content type does not explicitly specify a + /// charset. For details check [this][html5encoding]. + /// + /// [html5encoding]: https://blog.whatwg.org/the-road-to-html-5-character-encoding + /// + /// ### Default + /// + /// `false` when constructed with `Settings::default()`. + pub adjust_charset_on_meta_tag: bool, } impl Default for Settings<'_, '_> { @@ -530,6 +556,7 @@ impl Default for Settings<'_, '_> { memory_settings: MemorySettings::default(), strict: true, enable_esi_tags: false, + adjust_charset_on_meta_tag: false, } } } diff --git a/src/selectors_vm/mod.rs b/src/selectors_vm/mod.rs index a4c4d2eb..3d2fc779 100644 --- a/src/selectors_vm/mod.rs +++ b/src/selectors_vm/mod.rs @@ -550,10 +550,11 @@ mod tests { use crate::html::Namespace; use crate::memory::MemoryLimiter; use crate::rewritable_units::{DocumentEnd, Token, TokenCaptureFlags}; + use crate::rewriter::SharedEncoding; use crate::transform_stream::{ StartTagHandlingResult, TransformController, TransformStream, TransformStreamSettings, }; - use encoding_rs::UTF_8; + use encoding_rs::{Encoding, UTF_8}; use hashbrown::{HashMap, HashSet}; struct Expectation { @@ -629,7 +630,7 @@ mod tests { transform_controller: TestTransformController(test_fn), output_sink: |_: &[u8]| {}, preallocated_parsing_buffer_size: 0, - encoding, + encoding: SharedEncoding::new(crate::AsciiCompatibleEncoding::new(encoding).unwrap()), memory_limiter: MemoryLimiter::new_shared(2048), strict: true, }); diff --git a/src/transform_stream/dispatcher.rs b/src/transform_stream/dispatcher.rs index de55ad37..1bc9f0f9 100644 --- a/src/transform_stream/dispatcher.rs +++ b/src/transform_stream/dispatcher.rs @@ -8,8 +8,7 @@ use crate::parser::{ use crate::rewritable_units::{ DocumentEnd, Serialize, ToToken, Token, TokenCaptureFlags, TokenCapturer, TokenCapturerEvent, }; -use crate::rewriter::RewritingError; -use encoding_rs::Encoding; +use crate::rewriter::{RewritingError, SharedEncoding}; use std::rc::Rc; use TagTokenOutline::*; @@ -72,7 +71,7 @@ where got_flags_from_hint: bool, pending_element_aux_info_req: Option>, emission_enabled: bool, - encoding: &'static Encoding, + encoding: SharedEncoding, } impl Dispatcher @@ -80,14 +79,17 @@ where C: TransformController, O: OutputSink, { - pub fn new(transform_controller: C, output_sink: O, encoding: &'static Encoding) -> Self { + pub fn new(transform_controller: C, output_sink: O, encoding: SharedEncoding) -> Self { let initial_capture_flags = transform_controller.initial_capture_flags(); Dispatcher { transform_controller, output_sink, remaining_content_start: 0, - token_capturer: TokenCapturer::new(initial_capture_flags, encoding), + token_capturer: TokenCapturer::new( + initial_capture_flags, + SharedEncoding::clone(&encoding), + ), got_flags_from_hint: false, pending_element_aux_info_req: None, emission_enabled: true, @@ -108,7 +110,7 @@ where pub fn finish(&mut self, input: &[u8]) -> Result<(), RewritingError> { self.flush_remaining_input(input, input.len()); - let mut document_end = DocumentEnd::new(&mut self.output_sink, self.encoding); + let mut document_end = DocumentEnd::new(&mut self.output_sink, self.encoding.get()); self.transform_controller.handle_end(&mut document_end)?; diff --git a/src/transform_stream/mod.rs b/src/transform_stream/mod.rs index f543a4ab..8cf22610 100644 --- a/src/transform_stream/mod.rs +++ b/src/transform_stream/mod.rs @@ -3,8 +3,7 @@ mod dispatcher; use self::dispatcher::Dispatcher; use crate::memory::{Arena, SharedMemoryLimiter}; use crate::parser::{Parser, ParserDirective, SharedAttributeBuffer}; -use crate::rewriter::RewritingError; -use encoding_rs::Encoding; +use crate::rewriter::{RewritingError, SharedEncoding}; use std::cell::RefCell; use std::rc::Rc; @@ -21,7 +20,7 @@ where pub output_sink: O, pub preallocated_parsing_buffer_size: usize, pub memory_limiter: SharedMemoryLimiter, - pub encoding: &'static Encoding, + pub encoding: SharedEncoding, pub strict: bool, } diff --git a/tests/fixtures/token_capturing.rs b/tests/fixtures/token_capturing.rs index 1e72bb8f..326e4eb4 100644 --- a/tests/fixtures/token_capturing.rs +++ b/tests/fixtures/token_capturing.rs @@ -2,10 +2,7 @@ use crate::harness::suites::html5lib_tests::{ get_test_cases, TestCase, TestToken, TestTokenList, }; use crate::harness::{TestFixture, Input}; -use lol_html::{ - LocalNameHash, TokenCaptureFlags, LocalName, Token, StartTagHandlingResult, TransformController, - TransformStream, Namespace, TransformStreamSettings, MemoryLimiter -}; +use lol_html::{LocalNameHash, TokenCaptureFlags, LocalName, Token, StartTagHandlingResult, TransformController, TransformStream, Namespace, TransformStreamSettings, MemoryLimiter, SharedEncoding}; use lol_html::errors::RewritingError; use lol_html::html_content::{DocumentEnd, TextType}; use lol_html::test_utils::Output; @@ -106,7 +103,7 @@ pub fn parse( output_sink: |chunk: &[u8]| output.push(chunk), preallocated_parsing_buffer_size: 0, memory_limiter, - encoding: encoding.into(), + encoding: SharedEncoding::new(encoding.into()), strict: true } );