Skip to content

Commit

Permalink
chore(page): ignore js void
Browse files (browse the repository at this point in the history)
  • Loading branch information
j-mendez committed Dec 3, 2024
1 parent d7175f1 commit 2417953
Show file tree
Hide file tree
Showing 11 changed files with 60 additions and 56 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion examples/chrome_remote.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use std::io::Result;

async fn crawl_website(url: &str) -> Result<()> {
let mut website: Website = Website::new(url)
.with_limit(1)
.with_limit(500)
.with_chrome_intercept(RequestInterceptConfiguration::new(true))
.with_stealth(true)
.with_fingerprint(true)
Expand Down
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.13.99"
version = "2.13.100"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider/src/features/chrome.rs
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ pub async fn setup_browser_configuration(
browser_config.ignore_javascript = config.chrome_intercept.block_javascript;
browser_config.ignore_ads = config.chrome_intercept.block_ads;
browser_config.ignore_stylesheets = config.chrome_intercept.block_stylesheets;
browser_config.ignore_analytics = config.chrome_intercept.block_analytics;
browser_config.ignore_analytics = config.chrome_intercept.block_analytics;
browser_config.extra_headers = match config.headers {
Some(ref headers) => {
let hm =
Expand Down
76 changes: 41 additions & 35 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,54 +221,60 @@ pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(
) {
if let Some(b) = base {
let mut abs = convert_abs_path(b, href);
let scheme = abs.scheme();
let new_page = abs != **b;

if let Some(link_map) = links_pages {
link_map.insert(A::from(abs.as_str().to_string()));
link_map.insert(A::from(
(if new_page { abs.as_str() } else { href }).to_string(),
));
}

if scheme == "https" || scheme == "http" {
let host_name = abs.host_str();
let mut can_process = parent_host_match(
host_name,
base_domain,
parent_host,
base_input_domain,
sub_matcher,
);

if !can_process && host_name.is_some() && !external_domains_caseless.is_empty() {
can_process = external_domains_caseless
.contains::<CaseInsensitiveString>(&host_name.unwrap_or_default().into())
|| external_domains_caseless
.contains::<CaseInsensitiveString>(&CASELESS_WILD_CARD);
}
if new_page {
let scheme = abs.scheme();
if scheme == "https" || scheme == "http" {
let host_name = abs.host_str();
let mut can_process = parent_host_match(
host_name,
base_domain,
parent_host,
base_input_domain,
sub_matcher,
);

if can_process {
if abs.scheme() != parent_host_scheme.as_str() {
let _ = abs.set_scheme(parent_host_scheme.as_str());
if !can_process && host_name.is_some() && !external_domains_caseless.is_empty() {
can_process = external_domains_caseless
.contains::<CaseInsensitiveString>(&host_name.unwrap_or_default().into())
|| external_domains_caseless
.contains::<CaseInsensitiveString>(&CASELESS_WILD_CARD);
}

let hchars = abs.path();
if can_process {
if abs.scheme() != parent_host_scheme.as_str() {
let _ = abs.set_scheme(parent_host_scheme.as_str());
}

if let Some(position) = hchars.rfind('.') {
let hlen = hchars.len();
let has_asset = hlen - position;
let hchars = abs.path();

if has_asset >= 3 {
let next_position = position + 1;
if let Some(position) = hchars.rfind('.') {
let hlen = hchars.len();
let has_asset = hlen - position;

if !full_resources
&& !ONLY_RESOURCES
.contains::<CaseInsensitiveString>(&hchars[next_position..].into())
{
can_process = false;
if has_asset >= 3 {
let next_position = position + 1;

if !full_resources
&& !ONLY_RESOURCES.contains::<CaseInsensitiveString>(
&hchars[next_position..].into(),
)
{
can_process = false;
}
}
}
}

if can_process {
map.insert(abs.as_str().to_string().into());
if can_process {
map.insert(abs.as_str().to_string().into());
}
}
}
}
Expand Down
12 changes: 5 additions & 7 deletions spider/src/utils/abs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,21 +57,19 @@ pub(crate) fn parse_absolute_url(url: &str) -> Option<Box<Url>> {
pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url {
let href = href.trim();

if href.is_empty() || href == "#" {
if href.is_empty() || href == "#" || href == "javascript:void(0);" {
return base.clone();
}

// handle absolute urls.
if !href.starts_with("/") {
let length = href.len();

let protocol_slice = if length >= 8 && href.is_char_boundary(8) {
let protocol_slice = if href.is_char_boundary(8) {
&href[0..8]
} else if length >= 7 && href.is_char_boundary(7) {
} else if href.is_char_boundary(7) {
&href[0..7]
} else if length >= 6 && href.is_char_boundary(6) {
} else if href.is_char_boundary(6) {
&href[0..6]
} else if length >= 5 && href.is_char_boundary(5) {
} else if href.is_char_boundary(5) {
&href[0..5]
} else {
""
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.13.99"
version = "2.13.100"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.13.99"
version = "2.13.100"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.13.99"
version = "2.13.100"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.13.99"
version = "2.13.100"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.13.99"
version = "2.13.100"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down

0 comments on commit 2417953

Please sign in to comment.