From 5322150a7097e590285ed86a6f508f4a4957477d Mon Sep 17 00:00:00 2001 From: j-mendez Date: Mon, 18 Mar 2024 16:41:09 -0400 Subject: [PATCH] chore(chrome): fix semaphore limiting scrape --- Cargo.lock | 6 +++--- spider/Cargo.toml | 2 +- spider/README.md | 22 +++++++++++----------- spider/src/website.rs | 19 +++++++++++-------- spider_cli/Cargo.toml | 4 ++-- spider_worker/Cargo.toml | 4 ++-- 6 files changed, 30 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2f7a8120e..1f567f9bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2906,7 +2906,7 @@ dependencies = [ [[package]] name = "spider" -version = "1.85.3" +version = "1.85.4" dependencies = [ "ahash", "async-trait", @@ -2951,7 +2951,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "1.85.3" +version = "1.85.4" dependencies = [ "clap", "env_logger", @@ -2974,7 +2974,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "1.85.3" +version = "1.85.4" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index a88a6f402..39db186a4 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "1.85.3" +version = "1.85.4" authors = [ "madeindjs ", "j-mendez ", diff --git a/spider/README.md b/spider/README.md index 727527ed3..a64d0877d 100644 --- a/spider/README.md +++ b/spider/README.md @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom ```toml [dependencies] -spider = "1.85.3" +spider = "1.85.4" ``` And then the code: @@ -93,7 +93,7 @@ We have the following optional feature flags. ```toml [dependencies] -spider = { version = "1.85.3", features = ["regex", "ua_generator"] } +spider = { version = "1.85.4", features = ["regex", "ua_generator"] } ``` 1. `ua_generator`: Enables auto generating a random real User-Agent. @@ -135,7 +135,7 @@ Move processing to a worker, drastically increases performance even if worker is ```toml [dependencies] -spider = { version = "1.85.3", features = ["decentralized"] } +spider = { version = "1.85.4", features = ["decentralized"] } ``` ```sh @@ -166,7 +166,7 @@ Use the subscribe method to get a broadcast channel. ```toml [dependencies] -spider = { version = "1.85.3", features = ["sync"] } +spider = { version = "1.85.4", features = ["sync"] } ``` ```rust,no_run @@ -196,7 +196,7 @@ Allow regex for blacklisting routes ```toml [dependencies] -spider = { version = "1.85.3", features = ["regex"] } +spider = { version = "1.85.4", features = ["regex"] } ``` ```rust,no_run @@ -223,7 +223,7 @@ If you are performing large workloads you may need to control the crawler by ena ```toml [dependencies] -spider = { version = "1.85.3", features = ["control"] } +spider = { version = "1.85.4", features = ["control"] } ``` ```rust @@ -293,7 +293,7 @@ Use cron jobs to run crawls continuously at anytime. ```toml [dependencies] -spider = { version = "1.85.3", features = ["sync", "cron"] } +spider = { version = "1.85.4", features = ["sync", "cron"] } ``` ```rust,no_run @@ -332,7 +332,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network ```toml [dependencies] -spider = { version = "1.85.3", features = ["chrome", "chrome_intercept"] } +spider = { version = "1.85.4", features = ["chrome", "chrome_intercept"] } ``` You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug. @@ -362,7 +362,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`] ```toml [dependencies] -spider = { version = "1.85.3", features = ["cache"] } +spider = { version = "1.85.4", features = ["cache"] } ``` You need to set `website.cache` to true to enable as well. @@ -393,7 +393,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be ```toml [dependencies] -spider = { version = "1.85.3", features = ["smart"] } +spider = { version = "1.85.4", features = ["smart"] } ``` ```rust,no_run @@ -419,7 +419,7 @@ Set a depth limit to prevent forwarding. ```toml [dependencies] -spider = { version = "1.85.3", features = ["budget"] } +spider = { version = "1.85.4", features = ["budget"] } ``` ```rust,no_run diff --git a/spider/src/website.rs b/spider/src/website.rs index 8dba6c7f9..93e1c866f 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -2054,7 +2054,6 @@ impl Website { let shared = shared.clone(); set.spawn(async move { - drop(permit); let page_resource = crate::utils::fetch_page_html_raw( link.as_ref(), &shared.0, @@ -2076,6 +2075,7 @@ impl Website { } else { page.links(&shared.1).await }; + drop(permit); (link, page, page_links) }); @@ -2915,7 +2915,6 @@ impl Website { let shared = shared.clone(); set.spawn(async move { - drop(permit); let page_resource = crate::utils::fetch_page_html(link.as_ref(), &shared.0) .await; @@ -2935,6 +2934,7 @@ impl Website { } else { page.links(&shared.1).await }; + drop(permit); (link, page, page_links) }); @@ -3083,10 +3083,9 @@ impl Website { let shared = shared.clone(); set.spawn(async move { - drop(permit); let target_url = link.as_ref(); - match shared.4.new_page(target_url).await { + let r = match shared.4.new_page(target_url).await { Ok(new_page) => { match shared.5.evaluate_on_new_document { Some(ref script) => { @@ -3174,7 +3173,9 @@ impl Website { (link, page, Default::default()) } - } + }; + drop(permit); + r }); match q.as_mut() { @@ -3303,10 +3304,9 @@ impl Website { self.setup_chrome_interception(&new_page).await; set.spawn(async move { - drop(permit); let target_url = link.as_ref(); - match shared.5.new_page(target_url).await { + let r = match shared.5.new_page(target_url).await { Ok(new_page) => { let new_page = configure_browser(new_page, &shared.6) @@ -3385,7 +3385,10 @@ impl Website { (link, page, Default::default()) } - } + }; + drop(permit); + + r }); } _ => (), diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 330343589..d864595eb 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "1.85.3" +version = "1.85.4" authors = [ "madeindjs ", "j-mendez ", @@ -29,7 +29,7 @@ quote = "1.0.18" failure_derive = "0.1.8" [dependencies.spider] -version = "1.85.3" +version = "1.85.4" path = "../spider" [[bin]] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index c4e2c3bff..4c4ca358c 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "1.85.3" +version = "1.85.4" authors = [ "madeindjs ", "j-mendez ", @@ -25,7 +25,7 @@ lazy_static = "1.4.0" env_logger = "0.11.3" [dependencies.spider] -version = "1.85.3" +version = "1.85.4" path = "../spider" features = ["serde", "flexbuffers"]