Commit

chore(abs): fix page paths infinite handling
j-mendez committed Nov 27, 2024
1 parent a1c3190 commit 4d21b1f
Showing 11 changed files with 46 additions and 25 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 0 additions & 2 deletions examples/chrome_remote.rs
@@ -48,8 +48,6 @@ async fn crawl_website(url: &str) -> Result<()> {

 #[tokio::main]
 async fn main() -> Result<()> {
-    console_subscriber::init();
-
     let _ = tokio::join!(
         crawl_website("https://choosealicense.com"),
         crawl_website("https://jeffmendez.com"),
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "2.13.63"
+version = "2.13.65"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
4 changes: 2 additions & 2 deletions spider/src/page.rs
@@ -126,7 +126,7 @@ pub struct Page {
     /// The bytes of the resource.
     html: Option<Box<Bytes>>,
     /// Base absolute url for page.
-    base: Option<Url>,
+    pub(crate) base: Option<Url>,
     /// The raw url for the page. Useful since Url::parse adds a trailing slash.
     url: String,
     #[cfg(feature = "headers")]
@@ -856,7 +856,7 @@ impl Page {
     ) -> Vec<u8> {
         match &page.chrome_page {
             Some(chrome_page) => {
-                let format =
+                let format: chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat =
                     chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::from(
                         format,
                     );
3 changes: 2 additions & 1 deletion spider/src/utils/abs.rs
@@ -57,7 +57,7 @@ pub(crate) fn parse_absolute_url(url: &str) -> Option<Box<Url>> {
 pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url {
     let href = href.trim();

-    if href.is_empty() {
+    if href.is_empty() || href == "#" {
         return base.clone();
     }
@@ -87,6 +87,7 @@ pub(crate) fn convert_abs_path(base: &Url, href: &str) -> Url {
         return base.clone();
     }

+    // valid protocol to take absolute
     if protocol_slice.len() >= protocol_end + 3 {
         let protocol_slice = &href[..protocol_end + 3]; // +3 to include "://"

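
This one-line guard is the heart of the commit: a bare `#` href previously fell through to URL joining, where it could resolve into a new-looking path and keep feeding the crawl queue. Treating it like an empty href short-circuits to the page's own base. A minimal sketch of the guarded behavior using only the `url` crate; `to_abs` is an illustrative stand-in for `convert_abs_path`, which also handles protocols and other edge cases:

```rust
use url::Url;

// Simplified model of convert_abs_path's new guard: empty hrefs and bare
// fragment links resolve to the base page itself instead of a new path.
fn to_abs(base: &Url, href: &str) -> Url {
    let href = href.trim();
    if href.is_empty() || href == "#" {
        return base.clone();
    }
    base.join(href).unwrap_or_else(|_| base.clone())
}

fn main() {
    let base = Url::parse("https://choosealicense.com/licenses/").unwrap();
    // "#" now maps back to the page itself, so it dedupes against the
    // already-visited set instead of becoming a new crawl target.
    assert_eq!(to_abs(&base, "#"), base);
    assert_eq!(
        to_abs(&base, "mit/").as_str(),
        "https://choosealicense.com/licenses/mit/"
    );
}
```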
38 changes: 30 additions & 8 deletions spider/src/website.rs
@@ -280,7 +280,7 @@ impl Website {
             CaseInsensitiveString::new(&string_concat!("https://", url)).into()
         };

-        let domain_parsed = parse_absolute_url(&url);
+        let domain_parsed: Option<Box<Url>> = parse_absolute_url(&url);

         Self {
             configuration: Configuration::new().into(),
@@ -2400,6 +2400,7 @@ impl Website {
         self.configuration.clone(),
         self.url.inner().to_string(),
         context_id.clone(),
+        self.domain_parsed.clone(),
     ));

     let add_external = shared.3.len() > 0;
@@ -2551,12 +2552,18 @@ impl Website {
         page.set_external(shared.3.clone());
     }

+    let prev_domain = page.base;
+
+    page.base = shared.9.as_deref().cloned();
+
     let links = if full_resources {
         page.links_full(&shared.1).await
     } else {
         page.links(&shared.1).await
     };

+    page.base = prev_domain;
+
     if return_page_links {
         page.page_links = if links.is_empty() {
             None
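
The `prev_domain` lines above are a save/swap/restore around link extraction: `shared.9` carries the website's pre-parsed root (`domain_parsed`), which is swapped into `page.base` so relative hrefs resolve against the configured domain, and the page's own base is put back afterward. The `pub(crate)` change to `base` in `page.rs` exists to allow exactly this. A sketch of the pattern in isolation; this `Page` is a reduced stand-in, not the crate's:

```rust
use url::Url;

// Stand-in for spider's Page, reduced to the field the pattern touches.
struct Page {
    base: Option<Url>,
}

impl Page {
    // Placeholder: real link extraction resolves relative hrefs against `base`.
    fn links(&self) -> Vec<Url> {
        Vec::new()
    }
}

// Swap the crawl-wide domain in as the base for the duration of extraction,
// then restore whatever base the page carried before.
fn links_with_domain_base(page: &mut Page, domain_parsed: Option<&Url>) -> Vec<Url> {
    let prev_domain = page.base.take(); // take(), since we only hold &mut Page
    page.base = domain_parsed.cloned();
    let links = page.links();
    page.base = prev_domain;
    links
}
```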
@@ -2871,6 +2878,7 @@ impl Website {
         browser,
         self.configuration.clone(),
         context_id.clone(),
+        self.domain_parsed.clone(),
     ));

     let add_external = self.configuration.external_domains_caseless.len() > 0;
@@ -2984,13 +2992,19 @@ impl Website {
         );
     }

+    let prev_domain = page.base;
+
+    page.base = shared.7.as_deref().cloned();
+
     let links = page
         .smart_links(
             &shared.1, &shared.4, &shared.5,
             &shared.6,
         )
         .await;

+    page.base = prev_domain;
+
     if return_page_links {
         page.page_links = if links.is_empty() {
             None
@@ -3103,6 +3117,8 @@ impl Website {
     };

     let domain = self.url.inner().as_str();
+    self.domain_parsed = parse_absolute_url(&domain);
+
     let mut interval = tokio::time::interval(Duration::from_millis(15));
     let (sitemap_path, needs_trailing) = match &self.configuration.sitemap_url {
         Some(sitemap_path) => {
@@ -3145,7 +3161,7 @@ impl Website {
     if !self.handle_process(handle, &mut interval, async {}).await {
         break 'outer;
     }
-    let (tx, mut rx) = tokio::sync::mpsc::channel::<Page>(32);
+    let (tx, mut rx) = tokio::sync::mpsc::channel::<Page>(100);

     let shared = shared.clone();

@@ -3236,11 +3252,10 @@ impl Website {
             retry_count -= 1;
         }

-        match tx.reserve().await {
-            Ok(permit) => {
-                permit.send(page);
-            }
-            _ => (),
+        if let Ok(permit) =
+            tx.reserve().await
+        {
+            permit.send(page);
         }
     });
 }
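
Two small changes land here: the page channel buffer grows from 32 to 100, and the `match` on `tx.reserve()` collapses to an `if let`, since only the `Ok` arm did any work. `Sender::reserve` waits for a free slot and returns a permit whose `send` cannot fail, so a slow receiver back-pressures the crawl workers rather than dropping pages. A self-contained sketch of the permit flow, with the payload simplified to `String`:

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    // Buffer of 100, matching the new capacity: producers may run up to
    // 100 items ahead of the consumer before reserve() makes them wait.
    let (tx, mut rx) = mpsc::channel::<String>(100);

    tokio::spawn(async move {
        for n in 0..3 {
            // reserve() suspends until a slot frees up; the permit makes the
            // subsequent send infallible, so no error arm is needed.
            if let Ok(permit) = tx.reserve().await {
                permit.send(format!("page {n}"));
            }
        }
        // tx drops here, closing the channel and ending the recv loop below.
    });

    while let Some(page) = rx.recv().await {
        println!("received {page}");
    }
}
```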
@@ -3277,7 +3292,10 @@ impl Website {

     if let Ok(mut handle) = handles.await {
         for page in handle.iter_mut() {
+            let prev_domain = page.base;
+            page.base = self.domain_parsed.as_deref().cloned();
             let links = page.links(&selectors).await;
+            page.base = prev_domain;
             self.extra_links.extend(links)
         }
         if scrape {
@@ -3340,6 +3358,7 @@ impl Website {
     match self.setup_browser().await {
         Some((browser, browser_handle, mut context_id)) => {
             let domain = self.url.inner().as_str();
+            self.domain_parsed = parse_absolute_url(&domain);
             let mut interval = tokio::time::interval(Duration::from_millis(15));
             let (sitemap_path, needs_trailing) = match &self.configuration.sitemap_url {
                 Some(sitemap_path) => {
@@ -3565,7 +3584,10 @@ impl Website {

     if let Ok(mut handle) = handles.await {
         for page in handle.iter_mut() {
-            self.extra_links.extend(page.links(&selectors).await)
+            let prev_domain = page.base;
+            page.base = self.domain_parsed.as_deref().cloned();
+            self.extra_links.extend(page.links(&selectors).await);
+            page.base = prev_domain;
         }
         if scrape {
             match self.pages.as_mut() {
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_chrome"
-version = "2.13.63"
+version = "2.13.65"
 rust-version = "1.70"
 authors = [
     "j-mendez <jeff@spider.cloud>"
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "2.13.63"
+version = "2.13.65"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_transformations"
-version = "2.13.63"
+version = "2.13.65"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_utils"
-version = "2.13.63"
+version = "2.13.65"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "2.13.63"
+version = "2.13.65"
 authors = [
     "j-mendez <jeff@spider.cloud>"
 ]
