Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support dynamic charset change on meta tags in HtmlRewriter. #162

Merged
merged 1 commit into from
Mar 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ safemem = "0.3.3"
selectors = "0.22.0"
thiserror = "1.0.2"
hashbrown = "0.13.1"
mime = "0.3.16"

[dev-dependencies]
criterion = "0.4.0"
Expand Down
2 changes: 2 additions & 0 deletions c-api/src/rewriter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ pub extern "C" fn lol_html_rewriter_build(
memory_settings,
strict,
enable_esi_tags: false,
adjust_charset_on_meta_tag: false,
};

let output_sink = ExternOutputSink::new(output_sink, output_sink_user_data);
Expand Down Expand Up @@ -88,6 +89,7 @@ pub extern "C" fn unstable_lol_html_rewriter_build_with_esi_tags(
memory_settings,
strict,
enable_esi_tags: true,
adjust_charset_on_meta_tag: false,
};

let output_sink = ExternOutputSink::new(output_sink, output_sink_user_data);
Expand Down
1 change: 1 addition & 0 deletions fuzz/test_case/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ fn run_rewriter_iter(data: &[u8], selector: &str, encoding: &'static Encoding) -
encoding: encoding.try_into().unwrap(),
memory_settings: MemorySettings::default(),
strict: false,
adjust_charset_on_meta_tag: false,
},
|_: &[u8]| {},
);
Expand Down
34 changes: 34 additions & 0 deletions src/base/encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
use crate::rewriter::AsciiCompatibleEncoding;
use encoding_rs::Encoding;
use std::cell::Cell;
use std::ops::Deref;
use std::rc::Rc;

/// A cheaply clonable handle to a mutable, ASCII-compatible encoding value.
///
/// The encoding lives in a `Cell` behind an `Rc`, so every clone of a handle
/// refers to the same cell: a `set` made through one clone is visible to
/// `get` on all the others.
#[derive(Clone)]
pub struct SharedEncoding {
    /// Reference-counted cell holding the currently selected encoding.
    encoding: Rc<Cell<AsciiCompatibleEncoding>>,
}

impl SharedEncoding {
pub fn new(encoding: AsciiCompatibleEncoding) -> SharedEncoding {
SharedEncoding {
encoding: Rc::new(Cell::new(encoding)),
}
}

pub fn get(&self) -> &'static Encoding {
self.encoding.get().into()
}

pub fn set(&self, encoding: AsciiCompatibleEncoding) {
self.encoding.set(encoding);
}
}

impl Deref for SharedEncoding {
type Target = Encoding;

fn deref(&self) -> &'static Encoding {
self.get()
}
}
2 changes: 2 additions & 0 deletions src/base/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ mod debug_trace;

mod align;
mod bytes;
mod encoding;
mod range;

pub use self::align::Align;
pub use self::bytes::{Bytes, HasReplacementsError};
pub use self::encoding::SharedEncoding;
pub use self::range::Range;
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ cfg_if! {
if #[cfg(feature = "integration_test")] {
pub mod selectors_vm;

pub use self::base::SharedEncoding;

pub use self::transform_stream::{
StartTagHandlingResult, TransformController, TransformStream,
TransformStreamSettings
Expand Down
1 change: 0 additions & 1 deletion src/rewritable_units/document_end.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use super::mutations::content_to_bytes;
use super::ContentType;

use encoding_rs::Encoding;

use crate::transform_stream::OutputSink;
Expand Down
1 change: 1 addition & 0 deletions src/rewritable_units/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ mod tokens;

#[cfg(test)]
mod test_utils {
use crate::rewriter::AsciiCompatibleEncoding;
use crate::test_utils::{Output, ASCII_COMPATIBLE_ENCODINGS};
use crate::*;
use encoding_rs::Encoding;
Expand Down
10 changes: 5 additions & 5 deletions src/rewritable_units/tokens/capturer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ mod to_token;

use self::text_decoder::TextDecoder;
use super::*;
use crate::base::SharedEncoding;
use crate::parser::Lexeme;
use crate::rewriter::RewritingError;
use bitflags::bitflags;
use encoding_rs::Encoding;

pub use self::to_token::{ToToken, ToTokenResult};

Expand All @@ -30,15 +30,15 @@ pub enum TokenCapturerEvent<'i> {
type CapturerEventHandler<'h> = &'h mut dyn FnMut(TokenCapturerEvent) -> Result<(), RewritingError>;

pub struct TokenCapturer {
encoding: &'static Encoding,
encoding: SharedEncoding,
text_decoder: TextDecoder,
capture_flags: TokenCaptureFlags,
}

impl TokenCapturer {
pub fn new(capture_flags: TokenCaptureFlags, encoding: &'static Encoding) -> Self {
pub fn new(capture_flags: TokenCaptureFlags, encoding: SharedEncoding) -> Self {
TokenCapturer {
encoding,
encoding: SharedEncoding::clone(&encoding),
text_decoder: TextDecoder::new(encoding),
capture_flags,
}
Expand Down Expand Up @@ -70,7 +70,7 @@ impl TokenCapturer {
where
Lexeme<'i, T>: ToToken,
{
match lexeme.to_token(&mut self.capture_flags, self.encoding) {
match lexeme.to_token(&mut self.capture_flags, self.encoding.get()) {
ToTokenResult::Token(token) => {
self.flush_pending_text(&mut event_handler)?;
event_handler(TokenCapturerEvent::LexemeConsumed)?;
Expand Down
11 changes: 6 additions & 5 deletions src/rewritable_units/tokens/capturer/text_decoder.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
use super::*;
use crate::base::SharedEncoding;
use crate::html::TextType;
use crate::rewriter::RewritingError;
use encoding_rs::{CoderResult, Decoder, Encoding};
use encoding_rs::{CoderResult, Decoder};

// NOTE: this can't be refactored into method, because we hold a mutable reference for `self`
// during the decoding loop in `feed_text`.
macro_rules! emit {
($self:tt, $text:expr, $last:ident, $event_handler:ident) => {{
let token = TextChunk::new_token($text, $self.last_text_type, $last, $self.encoding);
let token = TextChunk::new_token($text, $self.last_text_type, $last, $self.encoding.get());

$event_handler(TokenCapturerEvent::TokenProduced(Box::new(token)))
}};
}

pub struct TextDecoder {
encoding: &'static Encoding,
encoding: SharedEncoding,
pending_text_streaming_decoder: Option<Decoder>,
text_buffer: String,
last_text_type: TextType,
}

impl TextDecoder {
pub fn new(encoding: &'static Encoding) -> Self {
pub fn new(encoding: SharedEncoding) -> Self {
TextDecoder {
encoding,
pending_text_streaming_decoder: None,
Expand Down Expand Up @@ -49,7 +50,7 @@ impl TextDecoder {
last: bool,
event_handler: CapturerEventHandler,
) -> Result<(), RewritingError> {
let encoding = self.encoding;
let encoding = self.encoding.get();
let buffer = self.text_buffer.as_mut_str();

let decoder = self
Expand Down
Loading