From ba48095a61e89c6c59aae674599ccab8c2751644 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Thu, 8 Aug 2024 08:29:20 -0400
Subject: [PATCH] feat(chrome): add trie for execution_maps and web_automation_maps

---
 Cargo.lock                           |   6 +-
 examples/chrome_web_automation.rs    |   2 +-
 spider/Cargo.toml                    |   2 +-
 spider/README.md                     |  24 ++--
 spider/src/configuration.rs          |  49 +++++---
 spider/src/features/chrome.rs        |  14 ++-
 spider/src/features/chrome_common.rs |  67 ++++++++---
 spider/src/features/openai_common.rs |  10 +-
 spider/src/page.rs                   |   6 +-
 spider/src/utils/mod.rs              |  15 ++-
 spider/src/utils/trie.rs             | 166 +++++++++++++++++++++++++++
 spider/src/website.rs                |  35 ++++--
 spider_cli/Cargo.toml                |   4 +-
 spider_utils/Cargo.toml              |   2 +-
 spider_worker/Cargo.toml             |   4 +-
 15 files changed, 334 insertions(+), 72 deletions(-)
 create mode 100644 spider/src/utils/trie.rs

diff --git a/Cargo.lock b/Cargo.lock
index d31f1a731..205c1fd7b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3495,7 +3495,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "1.99.24"
+version = "1.99.30"
 dependencies = [
  "adblock",
  "ahash",
@@ -3550,7 +3550,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "1.99.24"
+version = "1.99.30"
 dependencies = [
  "clap",
  "env_logger",
@@ -3581,7 +3581,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "1.99.24"
+version = "1.99.30"
 dependencies = [
  "env_logger",
  "lazy_static",
diff --git a/examples/chrome_web_automation.rs b/examples/chrome_web_automation.rs
index c667efab0..77725e03d 100644
--- a/examples/chrome_web_automation.rs
+++ b/examples/chrome_web_automation.rs
@@ -13,7 +13,7 @@ async fn main() {
     let mut automation_scripts = HashMap::new();
 
     automation_scripts.insert(
-        "https://rsseau.fr/en/blog".into(),
+        "/en/blog".into(),
         Vec::from([
             WebAutomation::Evaluate(r#"document.body.style.background = "blue";"#.into()),
             WebAutomation::ScrollY(2000),
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index d630f6fa3..c78e592dc 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.99.24"
+version = "1.99.30"
 authors = [
     "j-mendez "
 ]
diff --git a/spider/README.md b/spider/README.md
index cbee9cf51..f28be69c3 100644
--- a/spider/README.md
+++ b/spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom
 
 ```toml
 [dependencies]
-spider = "1.99.24"
+spider = "1.99.30"
 ```
 
 And then the code:
@@ -93,7 +93,7 @@ We have the following optional feature flags.
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24", features = ["regex", "ua_generator"] }
+spider = { version = "1.99.30", features = ["regex", "ua_generator"] }
 ```
 
 1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -139,7 +139,7 @@ Move processing to a worker, drastically increases performance even if worker is
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24", features = ["decentralized"] }
+spider = { version = "1.99.30", features = ["decentralized"] }
 ```
 
 ```sh
@@ -170,7 +170,7 @@ Use the subscribe method to get a broadcast channel.
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24", features = ["sync"] }
+spider = { version = "1.99.30", features = ["sync"] }
 ```
 
 ```rust,no_run
@@ -201,7 +201,7 @@ Allow regex for blacklisting routes
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24", features = ["regex"] }
+spider = { version = "1.99.30", features = ["regex"] }
 ```
 
 ```rust,no_run
@@ -228,7 +228,7 @@ If you are performing large workloads you may need to control the crawler by ena
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24", features = ["control"] }
+spider = { version = "1.99.30", features = ["control"] }
 ```
 
 ```rust
@@ -298,7 +298,7 @@ Use cron jobs to run crawls continuously at anytime.
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24", features = ["sync", "cron"] }
+spider = { version = "1.99.30", features = ["sync", "cron"] }
 ```
 
 ```rust,no_run
@@ -337,7 +337,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24", features = ["chrome", "chrome_intercept"] }
+spider = { version = "1.99.30", features = ["chrome", "chrome_intercept"] }
 ```
 
 You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
@@ -367,7 +367,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`]
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24", features = ["cache"] }
+spider = { version = "1.99.30", features = ["cache"] }
 ```
 
 You need to set `website.cache` to true to enable as well.
@@ -398,7 +398,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24", features = ["smart"] }
+spider = { version = "1.99.30", features = ["smart"] }
 ```
 
 ```rust,no_run
@@ -424,7 +424,7 @@ Use OpenAI to generate dynamic scripts to drive the browser done with the featur
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24", features = ["openai"] }
+spider = { version = "1.99.30", features = ["openai"] }
 ```
 
 ```rust
@@ -450,7 +450,7 @@ Set a depth limit to prevent forwarding.
 
 ```toml
 [dependencies]
-spider = { version = "1.99.24" }
+spider = { version = "1.99.30" }
 ```
 
 ```rust,no_run
diff --git a/spider/src/configuration.rs b/spider/src/configuration.rs
index bfbae7457..1b26b993a 100644
--- a/spider/src/configuration.rs
+++ b/spider/src/configuration.rs
@@ -1,16 +1,16 @@
 use crate::compact_str::CompactString;
 pub use crate::features::chrome_common::{
-    AuthChallengeResponse, AuthChallengeResponseResponse, AutomationScripts,
+    AuthChallengeResponse, AuthChallengeResponseResponse, AutomationScripts, AutomationScriptsMap,
     CaptureScreenshotFormat, CaptureScreenshotParams, ClipViewport, ExecutionScripts,
-    ScreenShotConfig, ScreenshotParams, Viewport, WaitFor, WaitForDelay, WaitForIdleNetwork,
-    WaitForSelector, WebAutomation,
+    ExecutionScriptsMap, ScreenShotConfig, ScreenshotParams, Viewport, WaitFor, WaitForDelay,
+    WaitForIdleNetwork, WaitForSelector, WebAutomation,
 };
 pub use crate::features::openai_common::GPTConfigs;
 use crate::website::CronType;
 use std::time::Duration;
 
 /// Redirect policy configuration for request
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, PartialEq)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 pub enum RedirectPolicy {
     #[default]
@@ -36,6 +36,15 @@ type AllowList = Box<regex::RegexSet>;
 /// website.configuration.tld = true;
 /// ```
 #[derive(Debug, Default, Clone)]
+#[cfg_attr(
+    all(
+        not(feature = "regex"),
+        not(feature = "openai"),
+        not(feature = "cache_openai")
+    ),
+    derive(PartialEq)
+)]
+
 pub struct Configuration {
     /// Respect robots.txt file and not scrape not allowed files. This may slow down crawls if robots.txt file has a delay included.
     pub respect_robots_txt: bool,
@@ -93,7 +102,7 @@ pub struct Configuration {
     pub chrome_intercept: bool,
     /// Configure the viewport for chrome. This does nothing without the flag `chrome` enabled.
     #[cfg(feature = "chrome")]
-    pub viewport: Option<chromiumoxide::handler::viewport::Viewport>,
+    pub viewport: Option<crate::features::chrome_common::Viewport>,
     /// Block all images from rendering in Chrome. This does nothing without the flag `chrome_intercept` enabled
     #[cfg(feature = "chrome")]
     pub chrome_intercept_block_visuals: bool,
@@ -135,10 +144,10 @@ pub struct Configuration {
     pub chrome_connection_url: Option<String>,
     /// Scripts to execute for individual pages, the full path of the url is required for an exact match. This is useful for running one off JS on pages like performing custom login actions.
     #[cfg(feature = "chrome")]
-    pub execution_scripts: ExecutionScripts,
+    pub execution_scripts: Option<ExecutionScripts>,
     /// Web automation scripts to run up to a duration of 60 seconds.
     #[cfg(feature = "chrome")]
-    pub automation_scripts: AutomationScripts,
+    pub automation_scripts: Option<AutomationScripts>,
     /// Use a shared queue strategy when crawling. This can scale workloads evenly that do not need priority.
     pub shared_queue: bool,
     /// The blacklist urls.
@@ -668,27 +677,41 @@ impl Configuration {
 
     #[cfg(not(feature = "chrome"))]
     /// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
-    pub fn with_execution_scripts(&mut self, _execution_scripts: ExecutionScripts) -> &mut Self {
+    pub fn with_execution_scripts(
+        &mut self,
+        _execution_scripts: Option<ExecutionScriptsMap>,
+    ) -> &mut Self {
         self
     }
 
     #[cfg(feature = "chrome")]
     /// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
-    pub fn with_execution_scripts(&mut self, execution_scripts: ExecutionScripts) -> &mut Self {
-        self.execution_scripts = execution_scripts;
+    pub fn with_execution_scripts(
+        &mut self,
+        execution_scripts: Option<ExecutionScriptsMap>,
+    ) -> &mut Self {
+        self.execution_scripts =
+            crate::features::chrome_common::convert_to_trie_execution_scripts(&execution_scripts);
         self
     }
 
     #[cfg(not(feature = "chrome"))]
     /// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
-    pub fn with_automation_scripts(&mut self, _automation_scripts: AutomationScripts) -> &mut Self {
+    pub fn with_automation_scripts(
+        &mut self,
+        _automation_scripts: Option<AutomationScriptsMap>,
+    ) -> &mut Self {
         self
     }
 
     #[cfg(feature = "chrome")]
     /// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
-    pub fn with_automation_scripts(&mut self, automation_scripts: AutomationScripts) -> &mut Self {
-        self.automation_scripts = automation_scripts;
+    pub fn with_automation_scripts(
+        &mut self,
+        automation_scripts: Option<AutomationScriptsMap>,
+    ) -> &mut Self {
+        self.automation_scripts =
+            crate::features::chrome_common::convert_to_trie_automation_scripts(&automation_scripts);
         self
     }
 
diff --git a/spider/src/features/chrome.rs b/spider/src/features/chrome.rs
index f3fc60823..ffcf49a10 100644
--- a/spider/src/features/chrome.rs
+++ b/spider/src/features/chrome.rs
@@ -133,7 +133,12 @@ fn create_handler_config(config: &Configuration) -> HandlerConfig {
         },
         request_intercept: cfg!(feature = "chrome_intercept") && config.chrome_intercept,
         cache_enabled: config.cache,
-        viewport: config.viewport.clone(),
+        viewport: match config.viewport {
+            Some(ref v) => Some(chromiumoxide::handler::viewport::Viewport::from(
+                v.to_owned(),
+            )),
+            _ => None,
+        },
         ..HandlerConfig::default()
     }
 }
@@ -161,7 +166,12 @@ pub async fn setup_browser_configuration(
         &proxies,
         config.chrome_intercept,
         config.cache,
-        config.viewport.clone(),
+        match config.viewport {
+            Some(ref v) => Some(chromiumoxide::handler::viewport::Viewport::from(
+                v.to_owned(),
+            )),
+            _ => None,
+        },
         &config.request_timeout,
     ) {
         Some(browser_config) => match Browser::launch(browser_config).await {
diff --git a/spider/src/features/chrome_common.rs b/spider/src/features/chrome_common.rs
index 51f6e7235..989fbf71f 100644
--- a/spider/src/features/chrome_common.rs
+++ b/spider/src/features/chrome_common.rs
@@ -1,4 +1,6 @@
-#[derive(Debug, Default, Clone)]
+use crate::utils::trie::Trie;
+
+#[derive(Debug, Default, Clone, PartialEq)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 /// Wait for network request with optional timeout. This does nothing without the `chrome` flag enabled.
 pub struct WaitForIdleNetwork {
@@ -13,7 +15,7 @@ impl WaitForIdleNetwork {
     }
 }
 
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, PartialEq)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 /// Wait for a selector with optional timeout. This does nothing without the `chrome` flag enabled.
 pub struct WaitForSelector {
@@ -30,7 +32,7 @@ impl WaitForSelector {
     }
 }
 
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, PartialEq)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 /// Wait for with a delay. Should only be used for testing purposes. This does nothing without the `chrome` flag enabled.
 pub struct WaitForDelay {
@@ -45,7 +47,7 @@ impl WaitForDelay {
     }
 }
 
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, PartialEq)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 /// The wait for options for the page. Multiple options can be set. This does nothing without the `chrome` flag enabled.
 pub struct WaitFor {
@@ -130,7 +132,7 @@ impl From
     }
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 /// View port handling for chrome.
 pub struct Viewport {
@@ -203,7 +205,7 @@ impl From<Viewport> for chromiumoxide::handler::viewport::Viewport {
 }
 
 #[doc = "Capture page screenshot.\n[captureScreenshot](https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-captureScreenshot)"]
-#[derive(Debug, Clone, Default)]
+#[derive(Debug, Clone, Default, PartialEq)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 pub struct CaptureScreenshotParams {
     #[doc = "Image compression format (defaults to png)."]
@@ -253,7 +255,7 @@ impl From<ClipViewport> for chromiumoxide::cdp::browser_protocol::page::Viewport
 }
 
 /// Screenshot configuration.
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, PartialEq)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 pub struct ScreenShotConfig {
     /// The screenshot params.
@@ -284,7 +286,7 @@ impl ScreenShotConfig {
 }
 
 /// The screenshot params for the page.
-#[derive(Default, Debug, Clone)]
+#[derive(Default, Debug, Clone, PartialEq)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 pub struct ScreenshotParams {
     /// Chrome DevTools Protocol screenshot options.
@@ -566,19 +568,56 @@ pub fn set_dynamic_scroll(timeout: u32) -> String {
 }
 
 /// Execution scripts to run on the page when using chrome by url.
-pub type ExecutionScripts = Option<hashbrown::HashMap<String, String>>;
+pub type ExecutionScriptsMap = hashbrown::HashMap<String, String>;
+/// Automation scripts to run on the page when using chrome by url.
+pub type AutomationScriptsMap = hashbrown::HashMap<String, Vec<WebAutomation>>;
+
+/// Execution scripts to run on the page when using chrome by url.
+pub type ExecutionScripts = Trie<String>;
 /// Automation scripts to run on the page when using chrome by url.
-pub type AutomationScripts = Option<hashbrown::HashMap<String, Vec<WebAutomation>>>;
+pub type AutomationScripts = Trie<Vec<WebAutomation>>;
+
+/// Convert ExecutionScripts to Trie.
+pub fn convert_to_trie_execution_scripts(
+    input: &Option<ExecutionScriptsMap>,
+) -> Option<Trie<String>> {
+    match input {
+        Some(ref scripts) => {
+            let mut trie = Trie::new();
+            for (path, script) in scripts {
+                trie.insert(path, script.clone());
+            }
+            Some(trie)
+        }
+        None => None,
+    }
+}
+
+/// Convert AutomationScripts to Trie.
+pub fn convert_to_trie_automation_scripts(
+    input: &Option<AutomationScriptsMap>,
+) -> Option<Trie<Vec<WebAutomation>>> {
+    match input {
+        Some(ref scripts) => {
+            let mut trie = Trie::new();
+            for (path, script_list) in scripts {
+                trie.insert(path, script_list.clone());
+            }
+            Some(trie)
+        }
+        None => None,
+    }
+}
 
 /// Eval execution scripts.
 #[cfg(feature = "chrome")]
 pub async fn eval_execution_scripts(
     page: &chromiumoxide::Page,
     target_url: &str,
-    execution_scripts: &ExecutionScripts,
+    execution_scripts: &Option<ExecutionScripts>,
 ) {
     match execution_scripts {
-        Some(ref scripts) => match scripts.get(target_url) {
+        Some(ref scripts) => match scripts.search(target_url) {
             Some(script) => {
                 let _ = page.evaluate(script.as_str()).await;
             }
@@ -593,10 +632,10 @@ pub async fn eval_automation_scripts(
 pub async fn eval_automation_scripts(
     page: &chromiumoxide::Page,
     target_url: &str,
-    automation_scripts: &AutomationScripts,
+    automation_scripts: &Option<AutomationScripts>,
 ) {
     if let Some(script_map) = automation_scripts {
-        if let Some(scripts) = script_map.get(target_url) {
+        if let Some(scripts) = script_map.search(target_url) {
             for script in scripts {
                 let result =
                     tokio::time::timeout(tokio::time::Duration::from_secs(60), script.run(page))
diff --git a/spider/src/features/openai_common.rs b/spider/src/features/openai_common.rs
index 78be120e4..1b8cff49d 100644
--- a/spider/src/features/openai_common.rs
+++ b/spider/src/features/openai_common.rs
@@ -1,5 +1,5 @@
 /// The type of prompt to use.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize))]
 pub enum Prompt {
     /// A single prompt to run.
@@ -67,6 +67,14 @@ impl Default for Prompt {
 /// The GPT configs to use for dynamic Javascript execution and other functionality.
 #[derive(Debug, Default, Clone)]
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+#[cfg_attr(
+    all(
+        not(feature = "regex"),
+        not(feature = "openai"),
+        not(feature = "cache_openai")
+    ),
+    derive(PartialEq)
+)]
 pub struct GPTConfigs {
     /// The prompt to use for the Chat. Example: Search for movies. This will attempt to get the code required to perform the action on the page.
     pub prompt: Prompt,
diff --git a/spider/src/page.rs b/spider/src/page.rs
index fdb0d1eb1..b640f0eba 100644
--- a/spider/src/page.rs
+++ b/spider/src/page.rs
@@ -335,8 +335,8 @@ impl Page {
         screenshot: &Option<crate::configuration::ScreenShotConfig>,
         page_set: bool,
         openai_config: &Option<crate::configuration::GPTConfigs>,
-        execution_scripts: &ExecutionScripts,
-        automation_scripts: &AutomationScripts,
+        execution_scripts: &Option<ExecutionScripts>,
+        automation_scripts: &Option<AutomationScripts>,
     ) -> Self {
         let page_resource = crate::utils::fetch_page_html(
             &url,
@@ -1402,7 +1402,7 @@ impl Page {
 
 #[cfg(test)]
 #[cfg(all(not(feature = "decentralized"), not(feature = "cache")))]
-const TEST_AGENT_NAME: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"));
+pub const TEST_AGENT_NAME: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"));
 
 #[cfg(all(
     feature = "headers",
diff --git a/spider/src/utils/mod.rs b/spider/src/utils/mod.rs
index a7ee5164c..e7cfe97be 100644
--- a/spider/src/utils/mod.rs
+++ b/spider/src/utils/mod.rs
@@ -1,5 +1,8 @@
 /// Utils to modify the HTTP header.
 pub mod header_utils;
+/// A trie struct.
+pub mod trie;
+
 #[cfg(feature = "chrome")]
 use crate::features::chrome_common::{AutomationScripts, ExecutionScripts};
 use crate::tokio_stream::StreamExt;
@@ -803,8 +806,8 @@ pub async fn fetch_page_html_chrome_base(
    page_set: bool,
    openai_config: &Option<crate::configuration::GPTConfigs>,
    url_target: Option<&str>,
-    execution_scripts: &ExecutionScripts,
-    automation_scripts: &AutomationScripts,
+    execution_scripts: &Option<ExecutionScripts>,
+    automation_scripts: &Option<AutomationScripts>,
 ) -> Result<PageResponse, chromiumoxide::error::CdpError> {
     let mut chrome_http_req_res = ChromeHTTPReqRes::default();
 
@@ -1433,8 +1436,8 @@ pub async fn fetch_page_html(
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<crate::configuration::GPTConfigs>,
-    execution_scripts: &ExecutionScripts,
-    automation_scripts: &AutomationScripts,
+    execution_scripts: &Option<ExecutionScripts>,
+    automation_scripts: &Option<AutomationScripts>,
 ) -> PageResponse {
     match fetch_page_html_chrome_base(
         &target_url,
@@ -1469,8 +1472,8 @@ pub async fn fetch_page_html_chrome(
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<crate::configuration::GPTConfigs>,
-    execution_scripts: &ExecutionScripts,
-    automation_scripts: &AutomationScripts,
+    execution_scripts: &Option<ExecutionScripts>,
+    automation_scripts: &Option<AutomationScripts>,
 ) -> PageResponse {
     match &page {
         page => {
diff --git a/spider/src/utils/trie.rs b/spider/src/utils/trie.rs
new file mode 100644
index 000000000..2f08bf4d9
--- /dev/null
+++ b/spider/src/utils/trie.rs
@@ -0,0 +1,166 @@
+use hashbrown::HashMap;
+use std::fmt::Debug;
+
+#[derive(Debug, Clone, PartialEq)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+/// TrieNode structure to handle clean url path mappings.
+pub struct TrieNode<V> {
+    /// The children for the trie.
+    pub children: HashMap<String, TrieNode<V>>,
+    /// The value for the trie.
+    pub value: Option<V>,
+}
+
+impl<V> TrieNode<V> {
+    /// A new trie node.
+    pub fn new() -> Self {
+        TrieNode {
+            children: HashMap::new(),
+            value: None,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+/// Trie value.
+pub struct Trie<V> {
+    /// A new trie node.
+    pub root: TrieNode<V>,
+}
+
+impl<V: Debug> Trie<V> {
+    /// A new trie node.
+    pub fn new() -> Self {
+        Self {
+            root: TrieNode::new(),
+        }
+    }
+
+    /// Normalize a url. This will perform a match against paths across all domains.
+    fn normalize_path(path: &str) -> String {
+        let start_pos = if let Some(pos) = path.find("://") {
+            if pos + 3 < path.len() {
+                path[pos + 3..]
+                    .find('/')
+                    .map_or(path.len(), |p| pos + 3 + p)
+            } else {
+                0
+            }
+        } else {
+            0
+        };
+
+        let base_path = if start_pos < path.len() {
+            &path[start_pos..]
+        } else {
+            path
+        };
+
+        let normalized_path = base_path
+            .split('/')
+            .filter(|segment| !segment.is_empty() && !segment.contains('.'))
+            .collect::<Vec<&str>>()
+            .join("/");
+
+        string_concat!("/", normalized_path)
+    }
+
+    /// Insert a path and its associated value into the trie.
+    pub fn insert(&mut self, path: &str, value: V) {
+        let normalized_path = Self::normalize_path(path);
+        let mut node = &mut self.root;
+
+        let segments: Vec<&str> = normalized_path
+            .split('/')
+            .filter(|s| !s.is_empty())
+            .collect();
+
+        for segment in segments {
+            node = node
+                .children
+                .entry(segment.to_string())
+                .or_insert_with(TrieNode::new);
+        }
+
+        node.value = Some(value);
+    }
+
+    /// Search for a path in the trie.
+    pub fn search(&self, input: &str) -> Option<&V> {
+        let normalized_path = Self::normalize_path(input);
+        let mut node = &self.root;
+
+        for segment in normalized_path.split('/').filter(|s| !s.is_empty()) {
+            if let Some(child) = node.children.get(segment) {
+                node = child;
+            } else {
+                return None;
+            }
+        }
+
+        node.value.as_ref()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_trie_node_new() {
+        let node: TrieNode<i32> = TrieNode::new();
+        assert!(node.children.is_empty());
+        assert!(node.value.is_none());
+    }
+
+    #[test]
+    fn test_trie_new() {
+        let trie: Trie<i32> = Trie::new();
+        assert!(trie.root.children.is_empty());
+        assert!(trie.root.value.is_none());
+    }
+
+    #[test]
+    fn test_insert_and_search() {
+        let mut trie: Trie<i32> = Trie::new();
+        trie.insert("/path/to/node", 42);
+        trie.insert("https://mywebsite/path/to/node", 42);
+
+        assert_eq!(trie.search("https://mywebsite/path/to/node"), Some(&42));
+        assert_eq!(trie.search("/path/to/node"), Some(&42));
+        assert_eq!(trie.search("/path"), None);
+        assert_eq!(trie.search("/path/to"), None);
+        assert_eq!(trie.search("/path/to/node/extra"), None);
+    }
+
+    #[test]
+    fn test_insert_multiple_nodes() {
+        let mut trie: Trie<i32> = Trie::new();
+        trie.insert("/path/to/node1", 1);
+        trie.insert("/path/to/node2", 2);
+        trie.insert("/path/to/node3", 3);
+
+        assert_eq!(trie.search("/path/to/node1"), Some(&1));
+        assert_eq!(trie.search("/path/to/node2"), Some(&2));
+        assert_eq!(trie.search("/path/to/node3"), Some(&3));
+    }
+
+    #[test]
+    fn test_insert_overwrite() {
+        let mut trie: Trie<i32> = Trie::new();
+        trie.insert("/path/to/node", 42);
+        trie.insert("/path/to/node", 84);
+
+        assert_eq!(trie.search("/path/to/node"), Some(&84));
+    }
+
+    #[test]
+    fn test_search_nonexistent_path() {
+        let mut trie: Trie<i32> = Trie::new();
+        trie.insert("/path/to/node", 42);
+
+        assert!(trie.search("/nonexistent").is_none());
+        assert!(trie.search("/path/to/wrongnode").is_none());
+    }
+}
diff --git a/spider/src/website.rs b/spider/src/website.rs
index cd6fc7c12..be1186715 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -1,6 +1,8 @@
 use crate::black_list::contains;
 use crate::compact_str::CompactString;
-use crate::configuration::{self, get_ua, Configuration, RedirectPolicy, WebAutomation};
+use crate::configuration::{
+    self, get_ua, AutomationScriptsMap, Configuration, ExecutionScriptsMap, RedirectPolicy,
+};
 use crate::packages::robotparser::parser::RobotFileParser;
 use crate::page::{build, get_page_selectors, Page};
 use crate::utils::log;
@@ -4810,7 +4812,7 @@ impl Website {
     /// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
     pub fn with_execution_scripts(
         &mut self,
-        _execution_scripts: Option<hashbrown::HashMap<String, String>>,
+        _execution_scripts: Option<ExecutionScriptsMap>,
     ) -> &mut Self {
         self
     }
@@ -4819,7 +4821,7 @@ impl Website {
     /// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
     pub fn with_execution_scripts(
         &mut self,
-        execution_scripts: Option<hashbrown::HashMap<String, String>>,
+        execution_scripts: Option<ExecutionScriptsMap>,
     ) -> &mut Self {
         self.configuration.with_execution_scripts(execution_scripts);
         self
     }
@@ -4829,7 +4831,7 @@ impl Website {
     /// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
     pub fn with_automation_scripts(
         &mut self,
-        _automation_scripts: Option<hashbrown::HashMap<String, Vec<WebAutomation>>>,
+        _automation_scripts: Option<AutomationScriptsMap>,
     ) -> &mut Self {
         self
     }
@@ -4838,7 +4840,7 @@ impl Website {
     /// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
     pub fn with_automation_scripts(
         &mut self,
-        automation_scripts: Option<hashbrown::HashMap<String, Vec<WebAutomation>>>,
+        automation_scripts: Option<AutomationScriptsMap>,
     ) -> &mut Self {
         self.configuration
             .with_automation_scripts(automation_scripts);
         self
     }
@@ -5406,6 +5408,7 @@ async fn test_crawl_subdomains() {
 }
 
 #[tokio::test]
+#[cfg(all(not(feature = "regex"), not(feature = "openai")))]
 async fn test_with_configuration() {
     let mut website = Website::new("https://choosealicense.com");
 
@@ -5416,17 +5419,27 @@ async fn test_with_configuration() {
         .with_delay(0)
         .with_request_timeout(None)
         .with_http2_prior_knowledge(false)
-        .with_ignore_sitemap(true)
-        .with_user_agent(Some("myapp/version"))
+        .with_user_agent(Some(crate::page::TEST_AGENT_NAME))
         .with_headers(None)
         .with_proxies(None);
 
-    website.crawl().await;
+    let mut configuration = Box::new(configuration::Configuration::new());
+
+    configuration.respect_robots_txt = true;
+    configuration.subdomains = true;
+    configuration.tld = false;
+    configuration.delay = 0;
+    configuration.request_timeout = None;
+    configuration.http2_prior_knowledge = false;
+    configuration.user_agent = Some(Box::new(CompactString::new(crate::page::TEST_AGENT_NAME)));
+    configuration.headers = None;
+    configuration.proxies = None;
 
     assert!(
-        website.links_visited.len() >= 1,
-        "{:?}",
-        website.links_visited
+        website.configuration == configuration,
+        "Left\n{:?}\n\nRight\n{:?}",
+        website.configuration,
+        configuration
     );
 }
 
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index dee4ae77a..956a7a6c1 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "1.99.24"
+version = "1.99.30"
 authors = [
     "j-mendez "
 ]
@@ -28,7 +28,7 @@ quote = "1"
 failure_derive = "0.1.8"
 
 [dependencies.spider]
-version = "1.99.24"
+version = "1.99.30"
 path = "../spider"
 
 [[bin]]
diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml
index 06ebea659..45f28f6f6 100644
--- a/spider_utils/Cargo.toml
+++ b/spider_utils/Cargo.toml
@@ -17,7 +17,7 @@ edition = "2018"
 indexmap = { version = "1", optional = true }
 
 [dependencies.spider]
-version = "1.99.24"
+version = "1.99.30"
 path = "../spider"
 
 [features]
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index 13175cff8..39e831a0a 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "1.99.24"
+version = "1.99.30"
 authors = [
     "j-mendez "
 ]
@@ -24,7 +24,7 @@ lazy_static = "1.4.0"
 env_logger = "0.11.3"
 
 [dependencies.spider]
-version = "1.99.24"
+version = "1.99.30"
 path = "../spider"
 features = ["serde", "flexbuffers"]
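
Reviewer note: the behavioral core of this patch is that `execution_scripts` and `automation_scripts` are now matched by normalized URL path through the new `utils::trie` module instead of an exact full-URL `HashMap` lookup. A quick sketch of those semantics, assuming the `spider::utils::trie` path stays public as declared here (the paths and values below are illustrative, not from the patch):

```rust
use spider::utils::trie::Trie;

fn main() {
    // The crawler stores JS snippets (String) or Vec<WebAutomation> as values;
    // a &str keeps this sketch small.
    let mut scripts: Trie<&str> = Trie::new();

    // normalize_path strips the scheme and host, keeping only "/en/blog".
    scripts.insert("https://example.com/en/blog", "window.scrollTo(0, 1000);");

    // A bare path and the same path on a different host hit the same node.
    assert_eq!(
        scripts.search("/en/blog"),
        Some(&"window.scrollTo(0, 1000);")
    );
    assert_eq!(
        scripts.search("https://other-site.com/en/blog"),
        Some(&"window.scrollTo(0, 1000);")
    );

    // Intermediate nodes hold no value unless one was inserted there.
    assert_eq!(scripts.search("/en"), None);
}
```

Note that `normalize_path` also filters out any segment containing a `.`, so file-style segments such as `index.html` are skipped during both insert and lookup.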
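End to end, the updated `examples/chrome_web_automation.rs` shape comes out roughly like the sketch below. It assumes the `chrome` feature is enabled and that spider re-exports `tokio` and `hashbrown` as in its README examples; the target site mirrors the example file and the script bodies are placeholders:

```rust
use spider::configuration::WebAutomation;
use spider::hashbrown::HashMap;
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut automation_scripts = HashMap::new();

    // One path key now covers every crawled URL that normalizes to "/en/blog".
    automation_scripts.insert(
        "/en/blog".into(),
        Vec::from([
            WebAutomation::Evaluate(r#"document.body.style.background = "blue";"#.into()),
            WebAutomation::ScrollY(2000),
        ]),
    );

    let mut website: Website = Website::new("https://rsseau.fr");

    website
        .with_automation_scripts(Some(automation_scripts))
        .crawl()
        .await;
}
```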