From 4d21b1f839b86157c7c894e21243567d4ad917e1 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Tue, 26 Nov 2024 19:30:07 -0500 Subject: [PATCH] chore(abs): fix page paths infinite handling --- Cargo.lock | 12 +++++----- examples/chrome_remote.rs | 2 -- spider/Cargo.toml | 2 +- spider/src/page.rs | 4 ++-- spider/src/utils/abs.rs | 3 ++- spider/src/website.rs | 38 ++++++++++++++++++++++++------- spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 11 files changed, 46 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 023525839..b9bfef388 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4304,7 +4304,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.13.63" +version = "2.13.65" dependencies = [ "ahash", "aho-corasick", @@ -4367,7 +4367,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.13.63" +version = "2.13.65" dependencies = [ "adblock", "async-tungstenite", @@ -4402,7 +4402,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.13.63" +version = "2.13.65" dependencies = [ "clap", "env_logger", @@ -4427,7 +4427,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.13.63" +version = "2.13.65" dependencies = [ "aho-corasick", "fast_html2md", @@ -4449,7 +4449,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.13.63" +version = "2.13.65" dependencies = [ "indexmap 1.9.3", "serde", @@ -4461,7 +4461,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.13.63" +version = "2.13.65" dependencies = [ "env_logger", "lazy_static", diff --git a/examples/chrome_remote.rs b/examples/chrome_remote.rs index f6577052b..84c006c39 100644 --- a/examples/chrome_remote.rs +++ b/examples/chrome_remote.rs @@ -48,8 +48,6 @@ async fn crawl_website(url: &str) -> Result<()> { #[tokio::main] async fn main() -> Result<()> { - console_subscriber::init(); - let _ = tokio::join!( crawl_website("https://choosealicense.com"), crawl_website("https://jeffmendez.com"), diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 7d3e61157..d59001be6 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.13.63" +version = "2.13.65" authors = [ "j-mendez " ] diff --git a/spider/src/page.rs b/spider/src/page.rs index ba71cd7f5..07cbac260 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -126,7 +126,7 @@ pub struct Page { /// The bytes of the resource. html: Option>, /// Base absolute url for page. - base: Option, + pub(crate) base: Option, /// The raw url for the page. Useful since Url::parse adds a trailing slash. url: String, #[cfg(feature = "headers")] @@ -856,7 +856,7 @@ impl Page { ) -> Vec { match &page.chrome_page { Some(chrome_page) => { - let format = + let format: chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat = chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::from( format, ); diff --git a/spider/src/utils/abs.rs b/spider/src/utils/abs.rs index 0102f3815..e29fbb4f8 100644 --- a/spider/src/utils/abs.rs +++ b/spider/src/utils/abs.rs @@ -57,7 +57,7 @@ pub(crate) fn parse_absolute_url(url: &str) -> Option> { pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url { let href = href.trim(); - if href.is_empty() { + if href.is_empty() || href == "#" { return base.clone(); } @@ -87,6 +87,7 @@ pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url { return base.clone(); } + // valid protocol to take absolute if protocol_slice.len() >= protocol_end + 3 { let protocol_slice = &href[..protocol_end + 3]; // +3 to include "://" diff --git a/spider/src/website.rs b/spider/src/website.rs index 15471ec7c..ded78a816 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -280,7 +280,7 @@ impl Website { CaseInsensitiveString::new(&string_concat!("https://", url)).into() }; - let domain_parsed = parse_absolute_url(&url); + let domain_parsed: Option> = parse_absolute_url(&url); Self { configuration: Configuration::new().into(), @@ -2400,6 +2400,7 @@ impl Website { self.configuration.clone(), self.url.inner().to_string(), context_id.clone(), + self.domain_parsed.clone(), )); let add_external = shared.3.len() > 0; @@ -2551,12 +2552,18 @@ impl Website { page.set_external(shared.3.clone()); } + let prev_domain = page.base; + + page.base = shared.9.as_deref().cloned(); + let links = if full_resources { page.links_full(&shared.1).await } else { page.links(&shared.1).await }; + page.base = prev_domain; + if return_page_links { page.page_links = if links.is_empty() { None @@ -2871,6 +2878,7 @@ impl Website { browser, self.configuration.clone(), context_id.clone(), + self.domain_parsed.clone(), )); let add_external = self.configuration.external_domains_caseless.len() > 0; @@ -2984,6 +2992,10 @@ impl Website { ); } + let prev_domain = page.base; + + page.base = shared.7.as_deref().cloned(); + let links = page .smart_links( &shared.1, &shared.4, &shared.5, @@ -2991,6 +3003,8 @@ impl Website { ) .await; + page.base = prev_domain; + if return_page_links { page.page_links = if links.is_empty() { None @@ -3103,6 +3117,8 @@ impl Website { }; let domain = self.url.inner().as_str(); + self.domain_parsed = parse_absolute_url(&domain); + let mut interval = tokio::time::interval(Duration::from_millis(15)); let (sitemap_path, needs_trailing) = match &self.configuration.sitemap_url { Some(sitemap_path) => { @@ -3145,7 +3161,7 @@ impl Website { if !self.handle_process(handle, &mut interval, async {}).await { break 'outer; } - let (tx, mut rx) = tokio::sync::mpsc::channel::(32); + let (tx, mut rx) = tokio::sync::mpsc::channel::(100); let shared = shared.clone(); @@ -3236,11 +3252,10 @@ impl Website { retry_count -= 1; } - match tx.reserve().await { - Ok(permit) => { - permit.send(page); - } - _ => (), + if let Ok(permit) = + tx.reserve().await + { + permit.send(page); } }); } @@ -3277,7 +3292,10 @@ impl Website { if let Ok(mut handle) = handles.await { for page in handle.iter_mut() { + let prev_domain = page.base; + page.base = self.domain_parsed.as_deref().cloned(); let links = page.links(&selectors).await; + page.base = prev_domain; self.extra_links.extend(links) } if scrape { @@ -3340,6 +3358,7 @@ impl Website { match self.setup_browser().await { Some((browser, browser_handle, mut context_id)) => { let domain = self.url.inner().as_str(); + self.domain_parsed = parse_absolute_url(&domain); let mut interval = tokio::time::interval(Duration::from_millis(15)); let (sitemap_path, needs_trailing) = match &self.configuration.sitemap_url { Some(sitemap_path) => { @@ -3565,7 +3584,10 @@ impl Website { if let Ok(mut handle) = handles.await { for page in handle.iter_mut() { - self.extra_links.extend(page.links(&selectors).await) + let prev_domain = page.base; + page.base = self.domain_parsed.as_deref().cloned(); + self.extra_links.extend(page.links(&selectors).await); + page.base = prev_domain; } if scrape { match self.pages.as_mut() { diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 98687b29e..96700437b 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.13.63" +version = "2.13.65" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 788b05c2a..02292cdad 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.13.63" +version = "2.13.65" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 101de401a..346bb2c50 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.13.63" +version = "2.13.65" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 436135a9d..03e47067b 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.13.63" +version = "2.13.65" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index d77ff60cc..1ecf9d3f4 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.13.63" +version = "2.13.65" authors = [ "j-mendez " ]