Skip to content

Commit

Permalink
chore(page): fix url encode handling mismatch
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 1, 2024
1 parent f184914 commit da531e9
Show file tree
Hide file tree
Showing 8 changed files with 44 additions and 31 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.13.90"
version = "2.13.91"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
51 changes: 32 additions & 19 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,29 @@ pub(crate) fn get_charset_from_content_type(
None
}

/// Check whether two URLs are the same once a trailing-slash difference is ignored.
///
/// Returns `true` when `url` and `target_url` are identical, or when exactly one of
/// them ends with `/` and removing that trailing slash makes them identical.
/// When both or neither end with `/`, the strings are compared as-is.
fn exact_url_match(url: &str, target_url: &str) -> bool {
    let end_target_slash = target_url.ends_with('/');
    let main_slash = url.ends_with('/');

    if end_target_slash && !main_slash {
        // Only the target carries the slash: strip it from the target.
        strip_trailing_slash(target_url) == url
    } else if !end_target_slash && main_slash {
        // Only the source carries the slash: strip it from the source.
        // Stripping the target here would be a no-op and the comparison
        // could never succeed.
        strip_trailing_slash(url) == target_url
    } else {
        url == target_url
    }
}

/// Return `s` with all trailing `/` characters removed.
///
/// A string that does not end with `/` is returned unchanged. The result
/// borrows from the input; nothing is allocated.
fn strip_trailing_slash(s: &str) -> &str {
    match s.strip_suffix('/') {
        // At least one slash was present; drop any further ones as well.
        Some(rest) => rest.trim_end_matches('/'),
        None => s,
    }
}

impl Page {
/// Instantiate a new page and gather the html repro of standard fetch_page_html.
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
Expand Down Expand Up @@ -573,26 +596,16 @@ impl Page {
let target_url = res.url().as_str();

// handle redirects
if url != target_url {
if url != target_url && !exact_url_match(&url, &target_url) {
let mut url = Box::new(CaseInsensitiveString::new(&url));
let end_target_slash = target_url.ends_with("/");
let main_slash = url.ends_with("/");

let exact_match = end_target_slash
&& !main_slash
&& target_url[..target_url.len() - 1] == *url
|| !end_target_slash && main_slash && url[..url.len() - 1] == *target_url;

if !exact_match {
modify_selectors(
prior_domain,
target_url,
domain_parsed,
&mut url,
selectors,
AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld),
);
}
modify_selectors(
prior_domain,
target_url,
domain_parsed,
&mut url,
selectors,
AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld),
);
};

// always use a base url.
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.13.90"
version = "2.13.91"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.13.90"
version = "2.13.91"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.13.90"
version = "2.13.91"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.13.90"
version = "2.13.91"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.13.90"
version = "2.13.91"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down

0 comments on commit da531e9

Please sign in to comment.