Commit ba48095
feat(chrome): add trie for execution_maps and web_automation_maps
j-mendez committed Aug 8, 2024
1 parent aacbbce commit ba48095
Showing 15 changed files with 334 additions and 72 deletions.
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion examples/chrome_web_automation.rs
@@ -13,7 +13,7 @@ async fn main() {
let mut automation_scripts = HashMap::new();

automation_scripts.insert(
"https://rsseau.fr/en/blog".into(),
"/en/blog".into(),
Vec::from([
WebAutomation::Evaluate(r#"document.body.style.background = "blue";"#.into()),
WebAutomation::ScrollY(2000),
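
This example change reflects the new trie-backed lookup: automation scripts are now keyed by the URL path rather than the full URL. Below is a minimal sketch of how the updated example might be wired end to end; the script steps and the `rsseau.fr` site come from the hunk above, while the imports, the assumption that `AutomationScriptsMap` behaves like a plain map, and the `#[tokio::main]` runtime are illustrative rather than taken from this commit.

```rust
use spider::configuration::{AutomationScriptsMap, WebAutomation};
use spider::website::Website;

#[tokio::main]
async fn main() {
    // Assumption: AutomationScriptsMap maps a URL path to a list of
    // WebAutomation steps, as the updated example suggests.
    let mut automation_scripts = AutomationScriptsMap::new();

    // The trie now matches on the request path, so a relative key is enough.
    automation_scripts.insert(
        "/en/blog".into(),
        Vec::from([
            WebAutomation::Evaluate(r#"document.body.style.background = "blue";"#.into()),
            WebAutomation::ScrollY(2000),
        ]),
    );

    let mut website = Website::new("https://rsseau.fr");
    website
        .configuration
        .with_automation_scripts(Some(automation_scripts));

    website.crawl().await;
}
```
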
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.99.24"
version = "1.99.30"
authors = [
"j-mendez <jeff@a11ywatch.com>"
]
24 changes: 12 additions & 12 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.99.24"
spider = "1.99.30"
```

And then the code:
@@ -93,7 +93,7 @@ We have the following optional feature flags.

```toml
[dependencies]
spider = { version = "1.99.24", features = ["regex", "ua_generator"] }
spider = { version = "1.99.30", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -139,7 +139,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.99.24", features = ["decentralized"] }
spider = { version = "1.99.30", features = ["decentralized"] }
```

```sh
@@ -170,7 +170,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.99.24", features = ["sync"] }
spider = { version = "1.99.30", features = ["sync"] }
```

```rust,no_run
@@ -201,7 +201,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.99.24", features = ["regex"] }
spider = { version = "1.99.30", features = ["regex"] }
```

```rust,no_run
@@ -228,7 +228,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.99.24", features = ["control"] }
spider = { version = "1.99.30", features = ["control"] }
```

```rust
@@ -298,7 +298,7 @@ Use cron jobs to run crawls continuously at anytime.

```toml
[dependencies]
spider = { version = "1.99.24", features = ["sync", "cron"] }
spider = { version = "1.99.30", features = ["sync", "cron"] }
```

```rust,no_run
@@ -337,7 +337,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network

```toml
[dependencies]
spider = { version = "1.99.24", features = ["chrome", "chrome_intercept"] }
spider = { version = "1.99.30", features = ["chrome", "chrome_intercept"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
@@ -367,7 +367,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`]

```toml
[dependencies]
spider = { version = "1.99.24", features = ["cache"] }
spider = { version = "1.99.30", features = ["cache"] }
```

You need to set `website.cache` to true to enable as well.
@@ -398,7 +398,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be

```toml
[dependencies]
spider = { version = "1.99.24", features = ["smart"] }
spider = { version = "1.99.30", features = ["smart"] }
```

```rust,no_run
@@ -424,7 +424,7 @@ Use OpenAI to generate dynamic scripts to drive the browser done with the featur

```toml
[dependencies]
spider = { version = "1.99.24", features = ["openai"] }
spider = { version = "1.99.30", features = ["openai"] }
```

```rust
@@ -450,7 +450,7 @@ Set a depth limit to prevent forwarding.

```toml
[dependencies]
spider = { version = "1.99.24" }
spider = { version = "1.99.30" }
```

```rust,no_run
49 changes: 36 additions & 13 deletions spider/src/configuration.rs
@@ -1,16 +1,16 @@
use crate::compact_str::CompactString;
pub use crate::features::chrome_common::{
-AuthChallengeResponse, AuthChallengeResponseResponse, AutomationScripts,
+AuthChallengeResponse, AuthChallengeResponseResponse, AutomationScripts, AutomationScriptsMap,
CaptureScreenshotFormat, CaptureScreenshotParams, ClipViewport, ExecutionScripts,
-ScreenShotConfig, ScreenshotParams, Viewport, WaitFor, WaitForDelay, WaitForIdleNetwork,
-WaitForSelector, WebAutomation,
+ExecutionScriptsMap, ScreenShotConfig, ScreenshotParams, Viewport, WaitFor, WaitForDelay,
+WaitForIdleNetwork, WaitForSelector, WebAutomation,
};
pub use crate::features::openai_common::GPTConfigs;
use crate::website::CronType;
use std::time::Duration;

/// Redirect policy configuration for request
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum RedirectPolicy {
#[default]
@@ -36,6 +36,15 @@ type AllowList = Box<regex::RegexSet>;
/// website.configuration.tld = true;
/// ```
#[derive(Debug, Default, Clone)]
#[cfg_attr(
all(
not(feature = "regex"),
not(feature = "openai"),
not(feature = "cache_openai")
),
derive(PartialEq)
)]

pub struct Configuration {
/// Respect robots.txt file and not scrape not allowed files. This may slow down crawls if robots.txt file has a delay included.
pub respect_robots_txt: bool,
@@ -93,7 +102,7 @@ pub struct Configuration {
pub chrome_intercept: bool,
/// Configure the viewport for chrome. This does nothing without the flag `chrome` enabled.
#[cfg(feature = "chrome")]
pub viewport: Option<chromiumoxide::handler::viewport::Viewport>,
pub viewport: Option<Viewport>,
/// Block all images from rendering in Chrome. This does nothing without the flag `chrome_intercept` enabled
#[cfg(feature = "chrome")]
pub chrome_intercept_block_visuals: bool,
@@ -135,10 +144,10 @@ pub struct Configuration {
pub chrome_connection_url: Option<String>,
/// Scripts to execute for individual pages, the full path of the url is required for an exact match. This is useful for running one off JS on pages like performing custom login actions.
#[cfg(feature = "chrome")]
-pub execution_scripts: ExecutionScripts,
+pub execution_scripts: Option<ExecutionScripts>,
/// Web automation scripts to run up to a duration of 60 seconds.
#[cfg(feature = "chrome")]
-pub automation_scripts: AutomationScripts,
+pub automation_scripts: Option<AutomationScripts>,
/// Use a shared queue strategy when crawling. This can scale workloads evenly that do not need priority.
pub shared_queue: bool,
/// The blacklist urls.
@@ -668,27 +677,41 @@ impl Configuration {

#[cfg(not(feature = "chrome"))]
/// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
pub fn with_execution_scripts(&mut self, _execution_scripts: ExecutionScripts) -> &mut Self {
pub fn with_execution_scripts(
&mut self,
_execution_scripts: Option<ExecutionScriptsMap>,
) -> &mut Self {
self
}

#[cfg(feature = "chrome")]
/// Set JS to run on certain pages. This method does nothing if the `chrome` is not enabled.
pub fn with_execution_scripts(&mut self, execution_scripts: ExecutionScripts) -> &mut Self {
self.execution_scripts = execution_scripts;
pub fn with_execution_scripts(
&mut self,
execution_scripts: Option<ExecutionScriptsMap>,
) -> &mut Self {
self.execution_scripts =
crate::features::chrome_common::convert_to_trie_execution_scripts(&execution_scripts);
self
}

#[cfg(not(feature = "chrome"))]
/// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
pub fn with_automation_scripts(&mut self, _automation_scripts: AutomationScripts) -> &mut Self {
pub fn with_automation_scripts(
&mut self,
_automation_scripts: Option<AutomationScriptsMap>,
) -> &mut Self {
self
}

#[cfg(feature = "chrome")]
/// Run web automated actions on certain pages. This method does nothing if the `chrome` is not enabled.
pub fn with_automation_scripts(&mut self, automation_scripts: AutomationScripts) -> &mut Self {
self.automation_scripts = automation_scripts;
pub fn with_automation_scripts(
&mut self,
automation_scripts: Option<AutomationScriptsMap>,
) -> &mut Self {
self.automation_scripts =
crate::features::chrome_common::convert_to_trie_automation_scripts(&automation_scripts);
self
}

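The heart of the commit is that `execution_scripts` and `automation_scripts` now sit behind trie lookups built by `convert_to_trie_execution_scripts` and `convert_to_trie_automation_scripts`, so per-page scripts can be matched by URL path (as the example change above suggests) instead of requiring an exact full-URL key. The crate's actual `Trie` type is not part of this diff; the sketch below only illustrates the idea of folding a path-keyed map into a segment trie, and every name in it is hypothetical.

```rust
use std::collections::HashMap;

/// Minimal path-segment trie (illustration only, not spider's `Trie` type).
#[derive(Default)]
struct PathTrie {
    children: HashMap<String, PathTrie>,
    scripts: Option<Vec<String>>,
}

impl PathTrie {
    /// Store scripts under a path such as "/en/blog".
    fn insert(&mut self, path: &str, scripts: Vec<String>) {
        let mut node = self;
        for seg in path.split('/').filter(|s| !s.is_empty()) {
            node = node.children.entry(seg.to_string()).or_default();
        }
        node.scripts = Some(scripts);
    }

    /// Exact-match lookup by path.
    fn get(&self, path: &str) -> Option<&Vec<String>> {
        let mut node = self;
        for seg in path.split('/').filter(|s| !s.is_empty()) {
            node = node.children.get(seg)?;
        }
        node.scripts.as_ref()
    }
}

/// Hypothetical analogue of `convert_to_trie_automation_scripts`: fold the
/// user-supplied path-keyed map into a trie, returning `None` when empty.
fn convert_to_trie(map: &Option<HashMap<String, Vec<String>>>) -> Option<PathTrie> {
    let map = map.as_ref()?;
    if map.is_empty() {
        return None;
    }
    let mut trie = PathTrie::default();
    for (path, scripts) in map {
        trie.insert(path, scripts.clone());
    }
    Some(trie)
}
```

A lookup such as `trie.get("/en/blog")` then resolves a page's scripts from its path alone, which lines up with the example above dropping the `https://rsseau.fr` prefix from its key.
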
14 changes: 12 additions & 2 deletions spider/src/features/chrome.rs
@@ -133,7 +133,12 @@ fn create_handler_config(config: &Configuration) -> HandlerConfig {
},
request_intercept: cfg!(feature = "chrome_intercept") && config.chrome_intercept,
cache_enabled: config.cache,
-viewport: config.viewport.clone(),
+viewport: match config.viewport {
+Some(ref v) => Some(chromiumoxide::handler::viewport::Viewport::from(
+v.to_owned(),
+)),
+_ => None,
+},
..HandlerConfig::default()
}
}
@@ -161,7 +166,12 @@ pub async fn setup_browser_configuration(
&proxies,
config.chrome_intercept,
config.cache,
-config.viewport.clone(),
+match config.viewport {
+Some(ref v) => Some(chromiumoxide::handler::viewport::Viewport::from(
+v.to_owned(),
+)),
+_ => None,
+},
&config.request_timeout,
) {
Some(browser_config) => match Browser::launch(browser_config).await {
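
`Configuration` now stores spider's own `Viewport` and converts to chromiumoxide's viewport type only when the handler configuration is built, keeping chromiumoxide types out of `Configuration` (which also fits the conditional `PartialEq` derive earlier in the diff, since every field must then be comparable). The two `match` blocks above are equivalent to mapping over the option; a small sketch, assuming the `chrome` feature is enabled and with the helper name invented for illustration:

```rust
use spider::configuration::Configuration;

/// Sketch: convert the configured viewport, if any, into chromiumoxide's
/// handler viewport at the boundary, mirroring the `match` blocks above.
fn handler_viewport(
    config: &Configuration,
) -> Option<chromiumoxide::handler::viewport::Viewport> {
    config
        .viewport
        .as_ref()
        .map(|v| chromiumoxide::handler::viewport::Viewport::from(v.to_owned()))
}
```
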
Remaining changed files are not shown here.
