Skip to content

Commit

Permalink
feat(transform): add transform_content_send async streaming
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 2, 2024
1 parent 09d7787 commit 7543bf3
Show file tree
Hide file tree
Showing 9 changed files with 137 additions and 17 deletions.
18 changes: 10 additions & 8 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.13.93"
version = "2.13.95"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.13.93"
version = "2.13.95"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.13.93"
version = "2.13.95"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
4 changes: 2 additions & 2 deletions spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.13.93"
version = "2.13.95"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand All @@ -24,7 +24,7 @@ tendril = "0.4"
llm_readability = "0"
thiserror = "1"
serde = { version = "1", features = ["derive"] }
fast_html2md = "0"
fast_html2md = {version = "0", features = ["tokio"]}
phf = "0.11"
phf_codegen = "0.11"
lol_html = { version = "2" }
Expand Down
108 changes: 108 additions & 0 deletions spider_transformations/src/transformation/content.rs
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,11 @@ pub fn transform_markdown(html: &str, commonmark: bool) -> String {
html2md::rewrite_html_custom_with_url(html, &None, commonmark, &None)
}

/// Convert HTML to markdown through the async streaming rewriter.
///
/// Shortcut that forwards `html` and the `commonmark` flag to the streaming
/// rewriter while supplying neither a custom tag set nor a page URL.
pub async fn transform_markdown_send(html: &str, commonmark: bool) -> String {
    // No custom tags and no URL are provided for this shortcut.
    let custom_tags = None;
    let page_url = None;

    html2md::rewrite_html_custom_with_url_streaming(html, &custom_tags, commonmark, &page_url)
        .await
}

/// transform the content to text raw shortcut
pub fn transform_text(html: &str) -> String {
super::text_extract::extract_text(html, &Default::default())
Expand Down Expand Up @@ -411,6 +416,109 @@ pub fn transform_content(
}
}

/// Transform the page content into the format selected by `c.return_format`.
///
/// Async variant of the content transform: the `CommonMark` and `Markdown`
/// formats are produced by awaiting the streaming HTML-to-markdown rewriter.
///
/// * `res` - page whose HTML is transformed.
/// * `c` - transformation settings (`readability`, `clean_html`, `return_format`).
/// * `encoding` - optional encoding forwarded to HTML extraction and to the XML conversion.
/// * `selector_config` - optional selector configuration forwarded to HTML extraction.
/// * `ignore_tags` - optional extra tag names that are stripped from the HTML and
///   also handed to the markdown and text extractors.
///
/// Content detected as binary is returned as extracted, without further processing.
/// A failed XML conversion yields an empty string.
pub async fn transform_content_send(
    res: &Page,
    c: &TransformConfig,
    encoding: &Option<String>,
    selector_config: &Option<SelectorConfiguration>,
    ignore_tags: &Option<Vec<String>>,
) -> String {
    let html = get_html_with_selector(res, encoding, selector_config);

    // Binary payloads are handed back untouched so they are never transformed or re-encoded.
    if is_binary_file(res.get_html_bytes_u8()) {
        return html;
    }

    let page_url = res.get_url_parsed();

    // Strip the config-derived elements plus any caller-supplied tags.
    let html = {
        let mut strip_tags = build_static_vector(c);

        if let Some(extra_tags) = ignore_tags {
            strip_tags.extend(extra_tags.iter().map(String::as_str));
        }

        if !strip_tags.is_empty() {
            clean_html_elements(&html, strip_tags)
        } else {
            html
        }
    };

    // Optional readability pass; on extraction failure the current HTML is kept.
    let html = if !c.readability {
        html
    } else {
        let extracted = llm_readability::extractor::extract(
            &mut html.as_bytes(),
            match page_url {
                Some(url) => url,
                None => &EXAMPLE_URL,
            },
        );

        match extracted {
            Ok(product) => product.content,
            Err(_) => html,
        }
    };

    let html = if c.clean_html { clean_html(&html) } else { html };

    // Owned set of the caller-supplied tag names for the markdown/text extractors.
    let custom_tags = ignore_tags.as_ref().map(|names| {
        let mut set = std::collections::HashSet::with_capacity(names.len());
        set.extend(names.iter().cloned());
        set
    });

    match c.return_format {
        ReturnFormat::Raw | ReturnFormat::Bytes => html,
        ReturnFormat::CommonMark | ReturnFormat::Markdown => {
            // Both markdown flavours share the streaming rewriter; only the flag differs.
            let commonmark = matches!(c.return_format, ReturnFormat::CommonMark);

            html2md::rewrite_html_custom_with_url_streaming(
                &html,
                &custom_tags,
                commonmark,
                page_url,
            )
            .await
        }
        ReturnFormat::Html2Text => {
            if html.is_empty() {
                html
            } else {
                crate::html2text::from_read(html.as_bytes(), html.len())
            }
        }
        ReturnFormat::Text => super::text_extract::extract_text(&html, &custom_tags),
        ReturnFormat::XML => {
            // Fall back to the example URL when the page has no parsed URL.
            let url_text = match page_url {
                Some(url) => url.to_string(),
                None => EXAMPLE_URL.to_string(),
            };

            convert_html_to_xml(html.trim(), &url_text, encoding).unwrap_or_default()
        }
    }
}

/// transform the content to bytes to prevent loss of precision.
pub fn transform_content_to_bytes(
res: &Page,
Expand Down
14 changes: 12 additions & 2 deletions spider_transformations/src/transformation/text_extract.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use html2md::extended::sifter::WhitespaceSifter;
use lol_html::{element, html_content::TextType, rewrite_str, text, RewriteStrSettings};
use lol_html::{element, html_content::TextType, text, RewriteStrSettings};

/// extract the text from HTML document.
pub fn extract_text(html: &str, custom: &Option<std::collections::HashSet<String>>) -> String {
Expand Down Expand Up @@ -43,7 +43,7 @@ pub fn extract_text(html: &str, custom: &Option<std::collections::HashSet<String
}
));

let _ = rewrite_str(
let _ = rewrite_str_empty(
html,
RewriteStrSettings {
element_content_handlers,
Expand All @@ -53,3 +53,13 @@ pub fn extract_text(html: &str, custom: &Option<std::collections::HashSet<String

extracted_text.sift()
}

/// Run the rewriter over `html` and discard the rewritten output.
///
/// The output sink is a no-op closure, so only the handlers configured in
/// `settings` observe the document; nothing is collected or returned.
///
/// # Errors
///
/// Returns the [`lol_html::errors::RewritingError`] produced while writing the
/// input or while finishing the rewriter.
pub fn rewrite_str_empty<'h, 's, H: lol_html::HandlerTypes>(
    html: &str,
    settings: impl Into<lol_html::Settings<'h, 's, H>>,
) -> Result<(), lol_html::errors::RewritingError> {
    // Every output chunk is dropped by the sink.
    let mut discarding = lol_html::HtmlRewriter::new(settings.into(), |_chunk: &[u8]| {});

    discarding.write(html.as_bytes())?;
    discarding.end()
}
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.13.93"
version = "2.13.95"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.13.93"
version = "2.13.95"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down

0 comments on commit 7543bf3

Please sign in to comment.