From ec5634bdb685aa287fad42a0d0ddec6db915a3f6 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Thu, 1 Aug 2024 09:27:51 -0400 Subject: [PATCH] chore(chrome): fix fingerprint and initial eval scripting --- Cargo.lock | 6 +-- spider/Cargo.toml | 2 +- spider/README.md | 24 +++++----- spider/src/page.rs | 31 +++++++------ spider/src/website.rs | 99 ++++++++++++++++++++++++---------------- spider_cli/Cargo.toml | 4 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 4 +- 8 files changed, 98 insertions(+), 74 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b814c31dc..d6d0e613e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3465,7 +3465,7 @@ dependencies = [ [[package]] name = "spider" -version = "1.99.14" +version = "1.99.16" dependencies = [ "ahash", "async-openai", @@ -3521,7 +3521,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "1.99.14" +version = "1.99.16" dependencies = [ "clap", "env_logger", @@ -3552,7 +3552,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "1.99.14" +version = "1.99.16" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 44f8a69d0..53acf7585 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "1.99.14" +version = "1.99.16" authors = [ "j-mendez " ] diff --git a/spider/README.md b/spider/README.md index 01b4e095f..877de9b23 100644 --- a/spider/README.md +++ b/spider/README.md @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom ```toml [dependencies] -spider = "1.99.14" +spider = "1.99.16" ``` And then the code: @@ -93,7 +93,7 @@ We have the following optional feature flags. ```toml [dependencies] -spider = { version = "1.99.14", features = ["regex", "ua_generator"] } +spider = { version = "1.99.16", features = ["regex", "ua_generator"] } ``` 1. `ua_generator`: Enables auto generating a random real User-Agent. @@ -138,7 +138,7 @@ Move processing to a worker, drastically increases performance even if worker is ```toml [dependencies] -spider = { version = "1.99.14", features = ["decentralized"] } +spider = { version = "1.99.16", features = ["decentralized"] } ``` ```sh @@ -169,7 +169,7 @@ Use the subscribe method to get a broadcast channel. ```toml [dependencies] -spider = { version = "1.99.14", features = ["sync"] } +spider = { version = "1.99.16", features = ["sync"] } ``` ```rust,no_run @@ -200,7 +200,7 @@ Allow regex for blacklisting routes ```toml [dependencies] -spider = { version = "1.99.14", features = ["regex"] } +spider = { version = "1.99.16", features = ["regex"] } ``` ```rust,no_run @@ -227,7 +227,7 @@ If you are performing large workloads you may need to control the crawler by ena ```toml [dependencies] -spider = { version = "1.99.14", features = ["control"] } +spider = { version = "1.99.16", features = ["control"] } ``` ```rust @@ -297,7 +297,7 @@ Use cron jobs to run crawls continuously at anytime. ```toml [dependencies] -spider = { version = "1.99.14", features = ["sync", "cron"] } +spider = { version = "1.99.16", features = ["sync", "cron"] } ``` ```rust,no_run @@ -336,7 +336,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network ```toml [dependencies] -spider = { version = "1.99.14", features = ["chrome", "chrome_intercept"] } +spider = { version = "1.99.16", features = ["chrome", "chrome_intercept"] } ``` You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug. @@ -366,7 +366,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`] ```toml [dependencies] -spider = { version = "1.99.14", features = ["cache"] } +spider = { version = "1.99.16", features = ["cache"] } ``` You need to set `website.cache` to true to enable as well. @@ -397,7 +397,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be ```toml [dependencies] -spider = { version = "1.99.14", features = ["smart"] } +spider = { version = "1.99.16", features = ["smart"] } ``` ```rust,no_run @@ -423,7 +423,7 @@ Use OpenAI to generate dynamic scripts to drive the browser done with the featur ```toml [dependencies] -spider = { version = "1.99.14", features = ["openai"] } +spider = { version = "1.99.16", features = ["openai"] } ``` ```rust @@ -449,7 +449,7 @@ Set a depth limit to prevent forwarding. ```toml [dependencies] -spider = { version = "1.99.14" } +spider = { version = "1.99.16" } ``` ```rust,no_run diff --git a/spider/src/page.rs b/spider/src/page.rs index e8bebf4f3..83666e254 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -1019,20 +1019,25 @@ impl Page { .evaluate_on_new_document { Some(ref script) => { - let _ = new_page - .evaluate_on_new_document( - script.as_str(), - ) - .await; + if configuration.fingerprint { + let _ = new_page + .evaluate_on_new_document(string_concat!( + crate::features::chrome::FP_JS, + script.as_str() + )) + .await; + } else { + let _ = + new_page.evaluate_on_new_document(script.as_str()).await; + } + } + _ => { + if configuration.fingerprint { + let _ = new_page + .evaluate_on_new_document(crate::features::chrome::FP_JS) + .await; + } } - _ => (), - } - if configuration.fingerprint { - let _ = new_page - .evaluate_on_new_document( - crate::features::chrome::FP_JS, - ) - .await; } let new_page = diff --git a/spider/src/website.rs b/spider/src/website.rs index a1020242b..c090778ff 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -1452,10 +1452,26 @@ impl Website { } } - if self.configuration.fingerprint { - let _ = chrome_page - .evaluate_on_new_document(crate::features::chrome::FP_JS) - .await; + match self.configuration.evaluate_on_new_document { + Some(ref script) => { + if self.configuration.fingerprint { + let _ = chrome_page + .evaluate_on_new_document(string_concat!( + crate::features::chrome::FP_JS, + script.as_str() + )) + .await; + } else { + let _ = chrome_page.evaluate_on_new_document(script.as_str()).await; + } + } + _ => { + if self.configuration.fingerprint { + let _ = chrome_page + .evaluate_on_new_document(crate::features::chrome::FP_JS) + .await; + } + } } let _ = self.setup_chrome_interception(&chrome_page).await; @@ -2323,19 +2339,6 @@ impl Website { _ => None, }; - match self.configuration.evaluate_on_new_document { - Some(ref script) => { - let _ = new_page.evaluate_on_new_document(script.as_str()).await; - } - _ => (), - } - - if self.configuration.fingerprint { - let _ = new_page - .evaluate_on_new_document(crate::features::chrome::FP_JS) - .await; - } - if match self.configuration.budget { Some(ref b) => match b.get(&*WILD_CARD_PATH) { Some(b) => b.eq(&1), @@ -3315,16 +3318,28 @@ impl Website { Ok(new_page) => { match self.configuration.evaluate_on_new_document { Some(ref script) => { - let _ = new_page - .evaluate_on_new_document(script.as_str()) - .await; + if self.configuration.fingerprint { + let _ = new_page + .evaluate_on_new_document(string_concat!( + crate::features::chrome::FP_JS, + script.as_str() + )) + .await; + } else { + let _ = new_page + .evaluate_on_new_document(script.as_str()) + .await; + } + } + _ => { + if self.configuration.fingerprint { + let _ = new_page + .evaluate_on_new_document( + crate::features::chrome::FP_JS, + ) + .await; + } } - _ => (), - } - if self.configuration.fingerprint { - let _ = new_page - .evaluate_on_new_document(crate::features::chrome::FP_JS) - .await; } let mut q = match &self.channel_queue { @@ -3414,21 +3429,25 @@ impl Website { Ok(new_page) => { match shared.5.evaluate_on_new_document { Some(ref script) => { - let _ = new_page - .evaluate_on_new_document( - script.as_str(), - ) - .await; + if shared.5.fingerprint { + let _ = new_page + .evaluate_on_new_document(string_concat!( + crate::features::chrome::FP_JS, + script.as_str() + )) + .await; + } else { + let _ = + new_page.evaluate_on_new_document(script.as_str()).await; + } + } + _ => { + if shared.5.fingerprint { + let _ = new_page + .evaluate_on_new_document(crate::features::chrome::FP_JS) + .await; + } } - _ => (), - } - - if shared.5.fingerprint { - let _ = new_page - .evaluate_on_new_document( - crate::features::chrome::FP_JS, - ) - .await; } let new_page = diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 186015fe0..f5208ce86 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "1.99.14" +version = "1.99.16" authors = [ "j-mendez " ] @@ -28,7 +28,7 @@ quote = "1" failure_derive = "0.1.8" [dependencies.spider] -version = "1.99.14" +version = "1.99.16" path = "../spider" [[bin]] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index dae0d8956..70c417d9a 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -17,7 +17,7 @@ edition = "2018" indexmap = { version = "1", optional = true } [dependencies.spider] -version = "1.99.14" +version = "1.99.16" path = "../spider" [features] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index cd90e9e4b..66129373d 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "1.99.14" +version = "1.99.16" authors = [ "j-mendez " ] @@ -24,7 +24,7 @@ lazy_static = "1.4.0" env_logger = "0.11.3" [dependencies.spider] -version = "1.99.14" +version = "1.99.16" path = "../spider" features = ["serde", "flexbuffers"]