Skip to content

Commit

Permalink
perf(transform): add rewriter markdown handling
Browse files — browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 15, 2024
1 parent 6066cdb commit 0ee519e
Show file tree
Hide file tree
Showing 9 changed files with 462 additions and 195 deletions.
579 changes: 440 additions & 139 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.13.8"
version = "2.13.9"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.13.8"
version = "2.13.9"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.13.8"
version = "2.13.9"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.13.8"
version = "2.13.9"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 2 additions & 0 deletions spider_transformations/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ pub mod html2xml;
mod markup5ever_rcdom;
/// Base transformations.
pub mod transformation;
// shortcut
pub use transformation::content::{transform_content, transform_content_to_bytes};
64 changes: 14 additions & 50 deletions spider_transformations/src/transformation/content.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use crate::html2xml::convert_html_to_xml;
use aho_corasick::AhoCorasick;
use html2md;
use html2md::ignore::IgnoreTagFactory;
use phf::phf_set;
use regex::Regex;
use serde::{Deserialize, Deserializer};
Expand All @@ -13,7 +12,6 @@ use spider::packages::scraper::{ElementRef, Selector};
use spider::page::Page;
use spider::url::Url;
use spider::utils::clean_html;
use std::collections::HashMap;

lazy_static! {
static ref AHO: AhoCorasick = AhoCorasick::new(["\n\n\n", "\n \n ", "\n\n\n\n\n"]).unwrap();
Expand Down Expand Up @@ -263,22 +261,7 @@ pub(crate) fn build_static_vector(config: &TransformConfig) -> Vec<&'static str>

/// transform the content to markdown shortcut
pub fn transform_markdown(html: &str, commonmark: bool) -> String {
let mut tag_factory: HashMap<String, Box<dyn html2md::TagHandlerFactory>> = HashMap::new();
let tag = Box::new(IgnoreTagFactory {});

tag_factory.insert(String::from("script"), tag.clone());
tag_factory.insert(String::from("style"), tag.clone());
tag_factory.insert(String::from("noscript"), tag.clone());

if !commonmark {
tag_factory.insert(String::from("meta"), tag.clone());
}

tag_factory.insert(String::from("iframe"), tag);

let html = html2md::parse_html_custom(&html, &tag_factory, commonmark);
let html = aho_clean_markdown(&html);
html
html2md::rewrite_html_custom_with_url(&html, &None, commonmark, &None)
}

/// transform the content to text raw shortcut
Expand Down Expand Up @@ -393,18 +376,14 @@ pub fn transform_content(
match c.return_format {
ReturnFormat::Raw | ReturnFormat::Bytes => base_html,
ReturnFormat::CommonMark => {
let mut tag_factory: HashMap<String, Box<dyn html2md::TagHandlerFactory>> =
HashMap::new();
let tag = Box::new(IgnoreTagFactory {});

tag_factory.insert(String::from("script"), tag.clone());
tag_factory.insert(String::from("style"), tag.clone());
tag_factory.insert(String::from("noscript"), tag.clone());
let mut tag_factory = None;

if let Some(ignore) = ignore_tags {
let mut tag_factor = std::collections::HashSet::with_capacity(ignore.len());
for ignore_tag_name in ignore {
tag_factory.insert(ignore_tag_name.into(), tag.clone());
tag_factor.insert(ignore_tag_name.into());
}
tag_factory.replace(tag_factor);
}

let base_html = if c.clean_html {
Expand All @@ -413,32 +392,22 @@ pub fn transform_content(
base_html
};

tag_factory.insert(String::from("iframe"), tag);

let html = html2md::parse_html_custom_with_url(
html2md::rewrite_html_custom_with_url(
&base_html.trim(),
&tag_factory,
true,
&url_parsed,
);
let html = aho_clean_markdown(&html);

html
)
}
ReturnFormat::Markdown => {
let mut tag_factory: HashMap<String, Box<dyn html2md::TagHandlerFactory>> =
HashMap::new();

let tag = Box::new(IgnoreTagFactory {});

tag_factory.insert(String::from("script"), tag.clone());
tag_factory.insert(String::from("style"), tag.clone());
tag_factory.insert(String::from("noscript"), tag.clone());
let mut tag_factory = None;

if let Some(ignore) = ignore_tags {
let mut tag_factor = std::collections::HashSet::with_capacity(ignore.len());
for ignore_tag_name in ignore {
tag_factory.insert(ignore_tag_name.into(), tag.clone());
tag_factor.insert(ignore_tag_name.into());
}
tag_factory.replace(tag_factor);
}

let base_html = if c.clean_html {
Expand All @@ -447,17 +416,12 @@ pub fn transform_content(
base_html
};

tag_factory.insert(String::from("iframe"), tag);

let html = html2md::parse_html_custom_with_url(
html2md::rewrite_html_custom_with_url(
&base_html.trim(),
&tag_factory,
false,
url_parsed,
);
let html = aho_clean_markdown(&html);

html
&url_parsed,
)
}
ReturnFormat::Html2Text => {
if !base_html.is_empty() {
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.13.8"
version = "2.13.9"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.13.8"
version = "2.13.9"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down

0 comments on commit 0ee519e

Please sign in to comment.