From 2417953dc9a3ea687c82abcf0094a9e52706d5d3 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Tue, 3 Dec 2024 15:21:44 -0500 Subject: [PATCH] chore(page): ignore js void --- Cargo.lock | 12 ++--- examples/chrome_remote.rs | 2 +- spider/Cargo.toml | 2 +- spider/src/features/chrome.rs | 2 +- spider/src/page.rs | 76 +++++++++++++++++-------------- spider/src/utils/abs.rs | 12 ++--- spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 11 files changed, 60 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b39dad4ca..35ae0d929 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4297,7 +4297,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.13.99" +version = "2.13.100" dependencies = [ "ahash", "aho-corasick", @@ -4359,7 +4359,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.13.99" +version = "2.13.100" dependencies = [ "adblock", "async-tungstenite", @@ -4394,7 +4394,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.13.99" +version = "2.13.100" dependencies = [ "clap", "env_logger", @@ -4419,7 +4419,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.13.99" +version = "2.13.100" dependencies = [ "aho-corasick", "fast_html2md", @@ -4441,7 +4441,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.13.99" +version = "2.13.100" dependencies = [ "indexmap 1.9.3", "serde", @@ -4453,7 +4453,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.13.99" +version = "2.13.100" dependencies = [ "env_logger", "lazy_static", diff --git a/examples/chrome_remote.rs b/examples/chrome_remote.rs index c40e33b43..84c006c39 100644 --- a/examples/chrome_remote.rs +++ b/examples/chrome_remote.rs @@ -9,7 +9,7 @@ use std::io::Result; async fn crawl_website(url: &str) -> Result<()> { let mut website: Website = Website::new(url) - .with_limit(1) + .with_limit(500) .with_chrome_intercept(RequestInterceptConfiguration::new(true)) .with_stealth(true) .with_fingerprint(true) diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 6eff39caf..94017d842 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.13.99" +version = "2.13.100" authors = [ "j-mendez " ] diff --git a/spider/src/features/chrome.rs b/spider/src/features/chrome.rs index 589a9052f..479cb5c95 100644 --- a/spider/src/features/chrome.rs +++ b/spider/src/features/chrome.rs @@ -278,7 +278,7 @@ pub async fn setup_browser_configuration( browser_config.ignore_javascript = config.chrome_intercept.block_javascript; browser_config.ignore_ads = config.chrome_intercept.block_ads; browser_config.ignore_stylesheets = config.chrome_intercept.block_stylesheets; - browser_config.ignore_analytics = config.chrome_intercept.block_analytics; + browser_config.ignore_analytics = config.chrome_intercept.block_analytics; browser_config.extra_headers = match config.headers { Some(ref headers) => { let hm = diff --git a/spider/src/page.rs b/spider/src/page.rs index 1602bed19..25abee13c 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -221,54 +221,60 @@ pub fn push_link>( ) { if let Some(b) = base { let mut abs = convert_abs_path(b, href); - let scheme = abs.scheme(); + let new_page = abs != **b; if let Some(link_map) = links_pages { - link_map.insert(A::from(abs.as_str().to_string())); + link_map.insert(A::from( + (if new_page { abs.as_str() } else { href }).to_string(), + )); } - if scheme == "https" || scheme == "http" { - let host_name = abs.host_str(); - let mut can_process = parent_host_match( - host_name, - base_domain, - parent_host, - base_input_domain, - sub_matcher, - ); - - if !can_process && host_name.is_some() && !external_domains_caseless.is_empty() { - can_process = external_domains_caseless - .contains::(&host_name.unwrap_or_default().into()) - || external_domains_caseless - .contains::(&CASELESS_WILD_CARD); - } + if new_page { + let scheme = abs.scheme(); + if scheme == "https" || scheme == "http" { + let host_name = abs.host_str(); + let mut can_process = parent_host_match( + host_name, + base_domain, + parent_host, + base_input_domain, + sub_matcher, + ); - if can_process { - if abs.scheme() != parent_host_scheme.as_str() { - let _ = abs.set_scheme(parent_host_scheme.as_str()); + if !can_process && host_name.is_some() && !external_domains_caseless.is_empty() { + can_process = external_domains_caseless + .contains::(&host_name.unwrap_or_default().into()) + || external_domains_caseless + .contains::(&CASELESS_WILD_CARD); } - let hchars = abs.path(); + if can_process { + if abs.scheme() != parent_host_scheme.as_str() { + let _ = abs.set_scheme(parent_host_scheme.as_str()); + } - if let Some(position) = hchars.rfind('.') { - let hlen = hchars.len(); - let has_asset = hlen - position; + let hchars = abs.path(); - if has_asset >= 3 { - let next_position = position + 1; + if let Some(position) = hchars.rfind('.') { + let hlen = hchars.len(); + let has_asset = hlen - position; - if !full_resources - && !ONLY_RESOURCES - .contains::(&hchars[next_position..].into()) - { - can_process = false; + if has_asset >= 3 { + let next_position = position + 1; + + if !full_resources + && !ONLY_RESOURCES.contains::( + &hchars[next_position..].into(), + ) + { + can_process = false; + } } } - } - if can_process { - map.insert(abs.as_str().to_string().into()); + if can_process { + map.insert(abs.as_str().to_string().into()); + } } } } diff --git a/spider/src/utils/abs.rs b/spider/src/utils/abs.rs index e29fbb4f8..c7bbc7313 100644 --- a/spider/src/utils/abs.rs +++ b/spider/src/utils/abs.rs @@ -57,21 +57,19 @@ pub(crate) fn parse_absolute_url(url: &str) -> Option> { pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url { let href = href.trim(); - if href.is_empty() || href == "#" { + if href.is_empty() || href == "#" || href == "javascript:void(0);" { return base.clone(); } // handle absolute urls. if !href.starts_with("/") { - let length = href.len(); - - let protocol_slice = if length >= 8 && href.is_char_boundary(8) { + let protocol_slice = if href.is_char_boundary(8) { &href[0..8] - } else if length >= 7 && href.is_char_boundary(7) { + } else if href.is_char_boundary(7) { &href[0..7] - } else if length >= 6 && href.is_char_boundary(6) { + } else if href.is_char_boundary(6) { &href[0..6] - } else if length >= 5 && href.is_char_boundary(5) { + } else if href.is_char_boundary(5) { &href[0..5] } else { "" diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 4e3a46361..8cc6ebe3a 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.13.99" +version = "2.13.100" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index e37b734e8..5895dd85a 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.13.99" +version = "2.13.100" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 7a8dee6a8..ded825c24 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.13.99" +version = "2.13.100" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index fa351a0be..03dcc40cd 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.13.99" +version = "2.13.100" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 21d83d2dc..19de97abf 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.13.99" +version = "2.13.100" authors = [ "j-mendez " ]