Skip to content

Commit

Permalink
Support dynamic charset change on meta tags in HtmlRewriter.
Browse files Browse the repository at this point in the history
  • Loading branch information
orium committed Mar 15, 2023
1 parent a0c3fb2 commit c0cfb1f
Show file tree
Hide file tree
Showing 11 changed files with 255 additions and 33 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ safemem = "0.3.3"
selectors = "0.22.0"
thiserror = "1.0.2"
hashbrown = "0.13.1"
mime = "0.3.16"

[dev-dependencies]
criterion = "0.4.0"
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ cfg_if! {
if #[cfg(feature = "integration_test")] {
pub mod selectors_vm;

pub use self::rewriter::SharedEncoding;

pub use self::transform_stream::{
StartTagHandlingResult, TransformController, TransformStream,
TransformStreamSettings
Expand Down
1 change: 0 additions & 1 deletion src/rewritable_units/document_end.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use super::mutations::content_to_bytes;
use super::ContentType;

use encoding_rs::Encoding;

use crate::transform_stream::OutputSink;
Expand Down
11 changes: 5 additions & 6 deletions src/rewritable_units/tokens/capturer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@ mod to_token;
use self::text_decoder::TextDecoder;
use super::*;
use crate::parser::Lexeme;
use crate::rewriter::RewritingError;
use crate::rewriter::{RewritingError, SharedEncoding};
use bitflags::bitflags;
use encoding_rs::Encoding;

pub use self::to_token::{ToToken, ToTokenResult};

Expand All @@ -30,15 +29,15 @@ pub enum TokenCapturerEvent<'i> {
type CapturerEventHandler<'h> = &'h mut dyn FnMut(TokenCapturerEvent) -> Result<(), RewritingError>;

pub struct TokenCapturer {
encoding: &'static Encoding,
encoding: SharedEncoding,
text_decoder: TextDecoder,
capture_flags: TokenCaptureFlags,
}

impl TokenCapturer {
pub fn new(capture_flags: TokenCaptureFlags, encoding: &'static Encoding) -> Self {
pub fn new(capture_flags: TokenCaptureFlags, encoding: SharedEncoding) -> Self {
TokenCapturer {
encoding,
encoding: SharedEncoding::clone(&encoding),
text_decoder: TextDecoder::new(encoding),
capture_flags,
}
Expand Down Expand Up @@ -70,7 +69,7 @@ impl TokenCapturer {
where
Lexeme<'i, T>: ToToken,
{
match lexeme.to_token(&mut self.capture_flags, self.encoding) {
match lexeme.to_token(&mut self.capture_flags, self.encoding.get()) {
ToTokenResult::Token(token) => {
self.flush_pending_text(&mut event_handler)?;
event_handler(TokenCapturerEvent::LexemeConsumed)?;
Expand Down
10 changes: 5 additions & 5 deletions src/rewritable_units/tokens/capturer/text_decoder.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
use super::*;
use crate::html::TextType;
use crate::rewriter::RewritingError;
use encoding_rs::{CoderResult, Decoder, Encoding};
use encoding_rs::{CoderResult, Decoder};

// NOTE: this can't be refactored into a method, because we hold a mutable reference to `self`
// during the decoding loop in `feed_text`.
macro_rules! emit {
($self:tt, $text:expr, $last:ident, $event_handler:ident) => {{
let token = TextChunk::new_token($text, $self.last_text_type, $last, $self.encoding);
let token = TextChunk::new_token($text, $self.last_text_type, $last, $self.encoding.get());

$event_handler(TokenCapturerEvent::TokenProduced(Box::new(token)))
}};
}

pub struct TextDecoder {
encoding: &'static Encoding,
encoding: SharedEncoding,
pending_text_streaming_decoder: Option<Decoder>,
text_buffer: String,
last_text_type: TextType,
}

impl TextDecoder {
pub fn new(encoding: &'static Encoding) -> Self {
pub fn new(encoding: SharedEncoding) -> Self {
TextDecoder {
encoding,
pending_text_streaming_decoder: None,
Expand Down Expand Up @@ -49,7 +49,7 @@ impl TextDecoder {
last: bool,
event_handler: CapturerEventHandler,
) -> Result<(), RewritingError> {
let encoding = self.encoding;
let encoding = self.encoding.get();
let buffer = self.text_buffer.as_mut_str();

let decoder = self
Expand Down
205 changes: 200 additions & 5 deletions src/rewriter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,46 @@ use crate::parser::ParsingAmbiguityError;
use crate::selectors_vm::{self, SelectorMatchingVm};
use crate::transform_stream::*;
use encoding_rs::Encoding;
use mime::Mime;
use std::borrow::Cow;
use std::cell::Cell;
use std::error::Error as StdError;
use std::fmt::{self, Debug};
use std::ops::Deref;
use std::rc::Rc;
use thiserror::Error;

pub use self::settings::*;

/// A clonable handle to an encoding value that can be replaced after creation.
///
/// The value lives in a reference-counted cell, so every clone of a handle
/// refers to the same storage: a [`SharedEncoding::set`] made through one clone
/// is observed by [`SharedEncoding::get`] on all the others.
#[derive(Clone)]
pub struct SharedEncoding {
    encoding: Rc<Cell<AsciiCompatibleEncoding>>,
}

impl SharedEncoding {
    /// Creates a new handle whose initial value is `encoding`.
    pub fn new(encoding: AsciiCompatibleEncoding) -> Self {
        let shared_cell = Rc::new(Cell::new(encoding));

        Self {
            encoding: shared_cell,
        }
    }

    /// Returns the encoding that is stored in the shared cell right now.
    pub fn get(&self) -> &'static Encoding {
        let current = self.encoding.get();

        current.0
    }

    /// Replaces the stored encoding for this handle and all of its clones.
    pub fn set(&self, encoding: AsciiCompatibleEncoding) {
        self.encoding.set(encoding);
    }
}

/// Allows a `SharedEncoding` to be dereferenced to an [`Encoding`].
///
/// The reference is whatever [`SharedEncoding::get`] returns at the moment of
/// the call; it is not updated if the shared value is replaced afterwards.
impl Deref for SharedEncoding {
    type Target = Encoding;

    fn deref(&self) -> &'static Encoding {
        Self::get(self)
    }
}

/// This is an encoding known to be ASCII-compatible.
///
/// Non-ASCII-compatible encodings (`UTF-16LE`, `UTF-16BE`, `ISO-2022-JP` and
Expand Down Expand Up @@ -140,6 +173,40 @@ macro_rules! guarded {
}};
}

/// Extracts an ASCII-compatible encoding from the `charset` parameter of `mime`.
///
/// Returns `None` when the MIME type has no `charset` parameter, when the
/// parameter value is not a label accepted by
/// `Encoding::for_label_no_replacement`, or when `AsciiCompatibleEncoding::new`
/// rejects the resulting encoding.
fn ascii_compatible_encoding_from_mimetype(mime: &Mime) -> Option<AsciiCompatibleEncoding> {
    let charset_param = mime.get_param("charset")?;
    let label = charset_param.as_str().as_bytes();
    let resolved = Encoding::for_label_no_replacement(label)?;

    AsciiCompatibleEncoding::new(resolved)
}

/// Builds the element content handler that updates `encoding` from `<meta>` tags.
///
/// The handler is registered for the `meta` selector and looks for an encoding
/// declaration in two places, in this order:
///
/// 1. the `charset` attribute (`<meta charset="...">`);
/// 2. the `charset` parameter of the `content` attribute, but only when the
///    `http-equiv` attribute equals `Content-Type` (ASCII case-insensitive).
///
/// The first source that yields a known, ASCII-compatible encoding wins and is
/// stored into `encoding` via [`SharedEncoding::set`]. If neither source yields
/// one, `encoding` is left untouched. The handler itself never fails.
fn handler_adjust_charset_on_meta_tag(
    encoding: SharedEncoding,
) -> (
    Cow<'static, crate::Selector>,
    ElementContentHandlers<'static>,
) {
    element!("meta", move |el| {
        let detected = el
            .get_attribute("charset")
            .and_then(|label| Encoding::for_label_no_replacement(label.as_bytes()))
            .and_then(AsciiCompatibleEncoding::new)
            // The `http-equiv` fallback is evaluated lazily: the `content`
            // attribute is only fetched and parsed as a MIME type when the
            // `charset` attribute did not already produce a usable encoding.
            .or_else(|| {
                el.get_attribute("http-equiv")
                    .filter(|http_equiv| http_equiv.eq_ignore_ascii_case("Content-Type"))
                    .and_then(|_| el.get_attribute("content"))
                    .and_then(|content| content.parse::<Mime>().ok())
                    .as_ref()
                    .and_then(ascii_compatible_encoding_from_mimetype)
            });

        if let Some(charset) = detected {
            encoding.set(charset);
        }

        Ok(())
    })
}

impl<'h, O: OutputSink> HtmlRewriter<'h, O> {
/// Constructs a new rewriter with the provided `settings` that writes
/// the output to the `output_sink`.
Expand All @@ -150,12 +217,24 @@ impl<'h, O: OutputSink> HtmlRewriter<'h, O> {
///
/// [`OutputSink`]: trait.OutputSink.html
pub fn new<'s>(settings: Settings<'h, 's>, output_sink: O) -> Self {
let encoding = settings.encoding;
let encoding = SharedEncoding::new(settings.encoding.into());
let mut selectors_ast = selectors_vm::Ast::default();
let mut dispatcher = ContentHandlersDispatcher::default();
let has_selectors = !settings.element_content_handlers.is_empty();
let has_selectors =
!settings.element_content_handlers.is_empty() || settings.adjust_charset_on_meta_tag;

let charset_adjust_handler = match settings.adjust_charset_on_meta_tag {
true => Some(handler_adjust_charset_on_meta_tag(SharedEncoding::clone(
&encoding,
))),
false => None,
};

let element_content_handlers = charset_adjust_handler
.into_iter()
.chain(settings.element_content_handlers);

for (selector, handlers) in settings.element_content_handlers {
for (selector, handlers) in element_content_handlers {
let locator = dispatcher.add_selector_associated_handlers(handlers);

selectors_ast.add_selector(&selector, locator);
Expand All @@ -171,7 +250,7 @@ impl<'h, O: OutputSink> HtmlRewriter<'h, O> {
let selector_matching_vm = if has_selectors {
Some(SelectorMatchingVm::new(
selectors_ast,
encoding.into(),
settings.encoding.into(),
Rc::clone(&memory_limiter),
settings.enable_esi_tags,
))
Expand All @@ -188,7 +267,7 @@ impl<'h, O: OutputSink> HtmlRewriter<'h, O> {
.memory_settings
.preallocated_parsing_buffer_size,
memory_limiter,
encoding: encoding.into(),
encoding,
strict: settings.strict,
});

Expand Down Expand Up @@ -289,6 +368,7 @@ mod tests {
use crate::html_content::ContentType;
use crate::test_utils::{Output, ASCII_COMPATIBLE_ENCODINGS, NON_ASCII_COMPATIBLE_ENCODINGS};
use encoding_rs::Encoding;
use itertools::Itertools;
use std::cell::RefCell;
use std::convert::TryInto;
use std::rc::Rc;
Expand All @@ -307,6 +387,17 @@ mod tests {
rewriter.end().unwrap();
}

/// Test helper: feeds `html` through an `HtmlRewriter` configured with
/// `settings` in a single `write` call and returns every byte the rewriter
/// emitted to its output sink.
///
/// Panics if either `write` or `end` reports an error.
fn rewrite_html_bytes(html: &[u8], settings: Settings) -> Vec<u8> {
    let mut out: Vec<u8> = Vec::with_capacity(html.len());

    let mut rewriter = HtmlRewriter::new(settings, |c: &[u8]| out.extend_from_slice(c));

    rewriter.write(html).unwrap();
    rewriter.end().unwrap();

    out
}

#[test]
fn rewrite_html_str() {
let res = rewrite_str(
Expand Down Expand Up @@ -558,6 +649,110 @@ mod tests {
assert_eq!(res, "<span>?</span>");
}

#[test]
fn test_rewrite_adjust_charset_on_meta_tag_attribute_charset() {
    use crate::html_content::{ContentType, TextChunk};

    // Document-level text handler that turns every `!` into `!!!`.
    let make_text_handler = || {
        doc_text!(move |chunk: &mut TextChunk| {
            let louder = chunk.as_str().replace('!', "!!!");
            chunk.replace(&louder, ContentType::Text);
            Ok(())
        })
    };

    // The document declares windows-1251 through `<meta charset>` and carries
    // five non-ASCII bytes in its body.
    let html: Vec<u8> = [
        br#"<meta charset="windows-1251"><html><head></head><body>I love "#.to_vec(),
        vec![0xd5, 0xec, 0xb3, 0xcb, 0xdc],
        b"!</body></html>".to_vec(),
    ]
    .into_iter()
    .concat();

    // Expected output: the input bytes unchanged, except that each `!` is tripled.
    let mut expected: Vec<u8> = Vec::new();
    for &byte in &html {
        if byte == b'!' {
            expected.extend_from_slice(b"!!!");
        } else {
            expected.push(byte);
        }
    }

    // With charset adjustment left at its default, the output must NOT match
    // the expectation (the non-ASCII bytes are expected to come out corrupted).
    let output_without_adjustment: Vec<u8> = rewrite_html_bytes(
        &html,
        Settings {
            document_content_handlers: vec![make_text_handler()],
            ..Settings::default()
        },
    );
    assert_ne!(output_without_adjustment, expected);

    // With charset adjustment enabled, the rewriter follows the meta tag and
    // the output is the expected windows-1251 byte sequence.
    let output_with_adjustment: Vec<u8> = rewrite_html_bytes(
        &html,
        Settings {
            document_content_handlers: vec![make_text_handler()],
            adjust_charset_on_meta_tag: true,
            ..Settings::default()
        },
    );
    assert_eq!(output_with_adjustment, expected);
}

#[test]
fn test_rewrite_adjust_charset_on_meta_tag_attribute_content_type() {
    use crate::html_content::{ContentType, TextChunk};

    // Document-level text handler that turns every `!` into `!!!`.
    let make_text_handler = || {
        doc_text!(move |chunk: &mut TextChunk| {
            let louder = chunk.as_str().replace('!', "!!!");
            chunk.replace(&louder, ContentType::Text);
            Ok(())
        })
    };

    // The document declares windows-1251 through
    // `<meta http-equiv="content-type" content="...; charset=...">` and carries
    // five non-ASCII bytes in its body.
    let html: Vec<u8> = [
        br#"<meta http-equiv="content-type" content="text/html; charset=windows-1251"><html><head></head><body>I love "#.to_vec(),
        vec![0xd5, 0xec, 0xb3, 0xcb, 0xdc],
        b"!</body></html>".to_vec(),
    ]
    .into_iter()
    .concat();

    // Expected output: the input bytes unchanged, except that each `!` is tripled.
    let mut expected: Vec<u8> = Vec::new();
    for &byte in &html {
        if byte == b'!' {
            expected.extend_from_slice(b"!!!");
        } else {
            expected.push(byte);
        }
    }

    // With charset adjustment left at its default, the output must NOT match
    // the expectation (the non-ASCII bytes are expected to come out corrupted).
    let output_without_adjustment: Vec<u8> = rewrite_html_bytes(
        &html,
        Settings {
            document_content_handlers: vec![make_text_handler()],
            ..Settings::default()
        },
    );
    assert_ne!(output_without_adjustment, expected);

    // With charset adjustment enabled, the rewriter follows the meta tag and
    // the output is the expected windows-1251 byte sequence.
    let output_with_adjustment: Vec<u8> = rewrite_html_bytes(
        &html,
        Settings {
            document_content_handlers: vec![make_text_handler()],
            adjust_charset_on_meta_tag: true,
            ..Settings::default()
        },
    );
    assert_eq!(output_with_adjustment, expected);
}

mod fatal_errors {
use super::*;
use crate::errors::MemoryLimitExceededError;
Expand Down
Loading

0 comments on commit c0cfb1f

Please sign in to comment.