From da531e9905c5659b657f07028fb305ad67f32fc3 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Sun, 1 Dec 2024 11:25:48 -0500 Subject: [PATCH] chore(page): fix url encode handling mismatch --- Cargo.lock | 12 ++++---- spider/Cargo.toml | 2 +- spider/src/page.rs | 51 +++++++++++++++++++------------ spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 8 files changed, 44 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4e900d8e0..d7bae818b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4295,7 +4295,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.13.90" +version = "2.13.91" dependencies = [ "ahash", "aho-corasick", @@ -4357,7 +4357,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.13.90" +version = "2.13.91" dependencies = [ "adblock", "async-tungstenite", @@ -4392,7 +4392,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.13.90" +version = "2.13.91" dependencies = [ "clap", "env_logger", @@ -4417,7 +4417,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.13.90" +version = "2.13.91" dependencies = [ "aho-corasick", "fast_html2md", @@ -4439,7 +4439,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.13.90" +version = "2.13.91" dependencies = [ "indexmap 1.9.3", "serde", @@ -4451,7 +4451,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.13.90" +version = "2.13.91" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index f918b35df..cb37d490b 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.13.90" +version = "2.13.91" authors = [ "j-mendez " ] diff --git a/spider/src/page.rs b/spider/src/page.rs index a85f341ea..1602bed19 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -526,6 +526,29 @@ pub(crate) fn 
get_charset_from_content_type(
     None
 }
 
+/// Check if two urls are the same, ignoring a trailing-slash difference between them.
+fn exact_url_match(url: &str, target_url: &str) -> bool {
+    let end_target_slash = target_url.ends_with('/');
+    let main_slash = url.ends_with('/');
+
+    if end_target_slash && !main_slash {
+        strip_trailing_slash(target_url) == url
+    } else if !end_target_slash && main_slash {
+        strip_trailing_slash(url) == target_url // strip the side that actually has the slash
+    } else {
+        url == target_url
+    }
+}
+
+/// Strip every trailing `/` from the end of the string; other strings are returned unchanged.
+fn strip_trailing_slash(s: &str) -> &str {
+    if s.ends_with('/') {
+        s.trim_end_matches('/')
+    } else {
+        s
+    }
+}
+
 impl Page {
     /// Instantiate a new page and gather the html repro of standard fetch_page_html.
     #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
@@ -573,26 +596,16 @@ impl Page {
                 let target_url = res.url().as_str();
 
                 // handle redirects
-                if url != target_url {
+                if url != target_url && !exact_url_match(&url, &target_url) {
                     let mut url = Box::new(CaseInsensitiveString::new(&url));
-                    let end_target_slash = target_url.ends_with("/");
-                    let main_slash = url.ends_with("/");
-
-                    let exact_match = end_target_slash
-                        && !main_slash
-                        && target_url[..target_url.len() - 1] == *url
-                        || !end_target_slash && main_slash && url[..url.len() - 1] == *target_url;
-
-                    if !exact_match {
-                        modify_selectors(
-                            prior_domain,
-                            target_url,
-                            domain_parsed,
-                            &mut url,
-                            selectors,
-                            AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld),
-                        );
-                    }
+                    modify_selectors(
+                        prior_domain,
+                        target_url,
+                        domain_parsed,
+                        &mut url,
+                        selectors,
+                        AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld),
+                    );
                 };
 
                 // always use a base url.
diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 41ea5773f..86f0de976 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.13.90" +version = "2.13.91" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 6ef68c4ed..698384b36 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.13.90" +version = "2.13.91" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 85f350811..73e1c2fa7 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.13.90" +version = "2.13.91" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 39aa5a8ef..5475cd09a 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.13.90" +version = "2.13.91" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 2b5d828fc..2bb1c7a30 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.13.90" +version = "2.13.91" authors = [ "j-mendez " ]