diff --git a/Cargo.lock b/Cargo.lock index a1e825a60..a63a33a74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4295,7 +4295,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.13.88" +version = "2.13.89" dependencies = [ "ahash", "aho-corasick", @@ -4357,7 +4357,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.13.88" +version = "2.13.89" dependencies = [ "adblock", "async-tungstenite", @@ -4392,7 +4392,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.13.88" +version = "2.13.89" dependencies = [ "clap", "env_logger", @@ -4417,7 +4417,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.13.88" +version = "2.13.89" dependencies = [ "aho-corasick", "fast_html2md", @@ -4439,7 +4439,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.13.88" +version = "2.13.89" dependencies = [ "indexmap 1.9.3", "serde", @@ -4451,7 +4451,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.13.88" +version = "2.13.89" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index d6c082d93..595a2ee7b 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.13.88" +version = "2.13.89" authors = [ "j-mendez " ] diff --git a/spider/src/page.rs b/spider/src/page.rs index 571a82c53..fcd40600c 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -223,11 +223,11 @@ pub fn push_link>( let mut abs = convert_abs_path(b, href); let scheme = abs.scheme(); - if scheme == "https" || scheme == "http" { - if let Some(link_map) = links_pages { - link_map.insert(A::from(abs.as_str().to_string())); - } + if let Some(link_map) = links_pages { + link_map.insert(A::from(abs.as_str().to_string())); + } + if scheme == "https" || scheme == "http" { let host_name = abs.host_str(); let mut can_process = parent_host_match( host_name, @@ -570,7 +570,8 @@ impl Page { res.content_length().unwrap_or(DEFAULT_BYTE_CAPACITY) as usize, ); - if url != res.url().as_str() { + // this allows us to get subdomains and tlds when being used. + if url != res.url().as_str() && (r_settings.subdomains || r_settings.tld) { let domain = res.url().as_str(); let mut url = Box::new(CaseInsensitiveString::new(&url)); @@ -584,7 +585,13 @@ impl Page { ); }; - let base = domain_parsed.as_deref(); + // always use a base url. + let base = if domain_parsed.is_none() { + prior_domain + } else { + domain_parsed + } + .as_deref(); let parent_host = &selectors.1[0]; // the host schemes diff --git a/spider/src/website.rs b/spider/src/website.rs index b27193608..b10625117 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -6,7 +6,7 @@ use crate::configuration::{ use crate::features::chrome_common::RequestInterceptConfiguration; use crate::packages::robotparser::parser::RobotFileParser; use crate::page::{Page, PageLinkBuildSettings}; -use crate::utils::abs::parse_absolute_url; +use crate::utils::abs::{convert_abs_url, parse_absolute_url}; use crate::utils::{ emit_log, emit_log_shutdown, setup_website_selectors, spawn_set, spawn_task, AllowedDomainTypes, }; @@ -1288,12 +1288,19 @@ impl Website { &page_links_settings, &mut links, Some(&mut links_ssg), - &mut domain_parsed, + &mut domain_parsed, // original domain &mut self.domain_parsed, &mut links_pages, ) .await; + if self.domain_parsed.is_none() { + if let Some(mut domain_parsed) = domain_parsed.take() { + convert_abs_url(&mut domain_parsed); + self.domain_parsed.replace(domain_parsed); + } + } + let mut retry_count = self.configuration.retry; let domains_caseless = &self.configuration.external_domains_caseless; @@ -1366,9 +1373,6 @@ impl Website { if self.configuration.return_page_links { page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new); - if let Some(page_links) = page.page_links.as_mut() { - page_links.extend(links_ssg.clone()); - } } links.extend(links_ssg); diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 91cc466cd..e7534bfeb 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.13.88" +version = "2.13.89" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 7c96f4f1a..69a42f96a 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.13.88" +version = "2.13.89" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 8e2e7c039..10c7d2e05 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.13.88" +version = "2.13.89" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 7a690f055..6c15ff759 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.13.88" +version = "2.13.89" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 06dec1a78..ec205f6c1 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.13.88" +version = "2.13.89" authors = [ "j-mendez " ]