Skip to content

Commit

Permalink
feat(chrome): add wait_for configuration delay, selector, and idle network
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Feb 21, 2024
1 parent 042c819 commit 7932600
Show file tree
Hide file tree
Showing 10 changed files with 298 additions and 96 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.82.7"
version = "1.83.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
Expand All @@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.82.7"
version = "1.83.0"
path = "../spider"
features = ["serde"]

Expand Down
39 changes: 31 additions & 8 deletions spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
[package]
name = "spider"
version = "1.82.7"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
version = "1.83.0"
authors = [
"madeindjs <contact@rousseau-alexandre.fr>",
"j-mendez <jeff@a11ywatch.com>",
]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
readme = "README.md"
Expand All @@ -15,9 +18,19 @@ edition = "2018"
maintenance = { status = "as-is" }

[dependencies]
reqwest = { version = "0.11.23", features = [ "brotli", "gzip", "deflate", "stream" ] }
reqwest = { version = "0.11.23", features = [
"brotli",
"gzip",
"deflate",
"stream",
] }
url = "2.5.0"
tokio = { version = "1.36.0", features = [ "rt-multi-thread", "macros", "time", "parking_lot" ] }
tokio = { version = "1.36.0", features = [
"rt-multi-thread",
"macros",
"time",
"parking_lot",
] }
tokio-stream = "0.1.14"
hashbrown = { version = "0.14.3", default-features = true }
log = "0.4.20"
Expand All @@ -39,14 +52,24 @@ bytes = { version = "1.5.0", features = ["serde"] }
serde = { version = "1.0.193", optional = true, features = ["derive"] }
flexbuffers = { version = "2.0.0", optional = true }
itertools = { version = "0.12.0", optional = true }
case_insensitive_string = { version = "0.2.2", features = [ "compact", "serde" ]}
jsdom = { version = "0.0.11-alpha.1", optional = true, features = [ "hashbrown", "tokio" ] }
chromiumoxide = { version = "0.5.7", optional = true, features = ["tokio-runtime", "bytes"], default-features = false }
case_insensitive_string = { version = "0.2.2", features = ["compact", "serde"] }
jsdom = { version = "0.0.11-alpha.1", optional = true, features = [
"hashbrown",
"tokio",
] }
chromiumoxide = { version = "0.5.7", optional = true, features = [
"tokio-runtime",
"bytes",
], default-features = false }
sitemap = { version = "0.4.1", optional = true }
chrono = { version = "0.4.31", optional = true }
cron = { version = "0.12.0", optional = true }
async-trait = { version = "0.1.75", optional = true }
napi = { version = "2", features = ["async", "tokio_rt", "napi4"], optional = true }
napi = { version = "2", features = [
"async",
"tokio_rt",
"napi4",
], optional = true }
strum = { version = "0.25", features = ["derive"] }
async_job = { version = "0.1.4", optional = true }
reqwest-middleware = { version = "0.2.4", optional = true }
Expand Down
22 changes: 11 additions & 11 deletions spider/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.82.7"
spider = "1.83.0"
```

And then the code:
Expand Down Expand Up @@ -93,7 +93,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl

```toml
[dependencies]
spider = { version = "1.82.7", features = ["regex", "ua_generator"] }
spider = { version = "1.83.0", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
Expand Down Expand Up @@ -134,7 +134,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.82.7", features = ["decentralized"] }
spider = { version = "1.83.0", features = ["decentralized"] }
```

```sh
Expand Down Expand Up @@ -165,7 +165,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.82.7", features = ["sync"] }
spider = { version = "1.83.0", features = ["sync"] }
```

```rust,no_run
Expand Down Expand Up @@ -195,7 +195,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.82.7", features = ["regex"] }
spider = { version = "1.83.0", features = ["regex"] }
```

```rust,no_run
Expand All @@ -222,7 +222,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.82.7", features = ["control"] }
spider = { version = "1.83.0", features = ["control"] }
```

```rust
Expand Down Expand Up @@ -292,7 +292,7 @@ Use cron jobs to run crawls continuously at anytime.

```toml
[dependencies]
spider = { version = "1.82.7", features = ["sync", "cron"] }
spider = { version = "1.83.0", features = ["sync", "cron"] }
```

```rust,no_run
Expand Down Expand Up @@ -331,7 +331,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network

```toml
[dependencies]
spider = { version = "1.82.7", features = ["chrome", "chrome_intercept"] }
spider = { version = "1.83.0", features = ["chrome", "chrome_intercept"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
Expand Down Expand Up @@ -361,7 +361,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`]

```toml
[dependencies]
spider = { version = "1.82.7", features = ["cache"] }
spider = { version = "1.83.0", features = ["cache"] }
```

You need to set `website.cache` to true to enable as well.
Expand Down Expand Up @@ -392,7 +392,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be

```toml
[dependencies]
spider = { version = "1.82.7", features = ["smart"] }
spider = { version = "1.83.0", features = ["smart"] }
```

```rust,no_run
Expand All @@ -418,7 +418,7 @@ Set a depth limit to prevent forwarding.

```toml
[dependencies]
spider = { version = "1.82.7", features = ["budget"] }
spider = { version = "1.83.0", features = ["budget"] }
```

```rust,no_run
Expand Down
128 changes: 125 additions & 3 deletions spider/src/configuration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,75 @@ impl WaitForIdleNetwork {
}
}

#[derive(Debug, Default, Clone)]
/// Wait for a selector with optional timeout. This does nothing without the `chrome` flag enabled.
pub struct WaitForSelector {
    /// The max time to wait for the selector. It is recommended to set this to a value around 30s. Set the value to None to remove the timeout.
    pub timeout: Option<core::time::Duration>,
    /// The selector to wait for.
    pub selector: String,
}

impl WaitForSelector {
    /// Create a new `WaitForSelector` from an optional max wait duration and the selector to wait for.
    pub fn new(timeout: Option<core::time::Duration>, selector: String) -> Self {
        Self { timeout, selector }
    }
}

#[derive(Debug, Default, Clone)]
/// Pause for a fixed delay before continuing. Intended for testing only; this does nothing without the `chrome` flag enabled.
pub struct WaitForDelay {
    /// Maximum duration to pause. Around 30s is a sensible upper bound; `None` disables the timeout entirely.
    pub timeout: Option<core::time::Duration>,
}

impl WaitForDelay {
    /// Build a `WaitForDelay` from an optional maximum wait duration.
    pub fn new(timeout: Option<core::time::Duration>) -> Self {
        WaitForDelay { timeout }
    }
}

#[derive(Debug, Default, Clone)]
/// The wait for options for the page. Multiple options can be set. This does nothing without the `chrome` flag enabled.
pub struct WaitFor {
    /// Wait until the selector is present on the page, with its own optional timeout.
    pub selector: Option<WaitForSelector>,
    /// Wait for idle network 500ms.
    pub idle_network: Option<WaitForIdleNetwork>,
    /// Wait for delay. Should only be used for testing.
    pub delay: Option<WaitForDelay>,
    /// Wait for page navigations.
    pub page_navigations: bool,
}

impl WaitFor {
    /// Create new WaitFor with timeout.
    ///
    /// `timeout` is shared by the idle-network waiter (when `idle_network` is true)
    /// and the selector waiter (when `selector` is `Some`).
    pub fn new(
        timeout: Option<core::time::Duration>,
        delay: Option<WaitForDelay>,
        page_navigations: bool,
        idle_network: bool,
        selector: Option<String>,
    ) -> Self {
        Self {
            page_navigations,
            idle_network: if idle_network {
                Some(WaitForIdleNetwork::new(timeout))
            } else {
                None
            },
            // map replaces the is_some()/unwrap_or_default() pattern: the selector
            // waiter is built only when a selector string was actually provided.
            selector: selector.map(|selector| WaitForSelector::new(timeout, selector)),
            delay,
        }
    }
}

/// Structure to configure `Website` crawler
/// ```rust
/// use spider::website::Website;
Expand Down Expand Up @@ -115,8 +184,8 @@ pub struct Configuration {
/// Collect all the resources found on the page.
pub full_resources: bool,
#[cfg(feature = "chrome")]
/// Wait for idle network connections.
pub wait_for_idle_network: Option<WaitForIdleNetwork>,
/// Wait for options for the page.
pub wait_for: Option<WaitFor>,
/// Dangerously accept invalid certficates
pub accept_invalid_certs: bool,
}
Expand Down Expand Up @@ -421,7 +490,14 @@ impl Configuration {
&mut self,
wait_for_idle_network: Option<WaitForIdleNetwork>,
) -> &mut Self {
self.wait_for_idle_network = wait_for_idle_network;
match self.wait_for.as_mut() {
Some(wait_for) => wait_for.idle_network = wait_for_idle_network,
_ => {
let mut wait_for = WaitFor::default();
wait_for.idle_network = wait_for_idle_network;
self.wait_for = Some(wait_for);
}
}
self
}

Expand All @@ -434,6 +510,52 @@ impl Configuration {
self
}

#[cfg(feature = "chrome")]
/// Wait for a selector on the page. This method does nothing if the `chrome` feature is not enabled.
pub fn with_wait_for_selector(
    &mut self,
    wait_for_selector: Option<WaitForSelector>,
) -> &mut Self {
    // create the wait_for container on first use, then set only the selector option.
    self.wait_for.get_or_insert_with(WaitFor::default).selector = wait_for_selector;
    self
}

#[cfg(not(feature = "chrome"))]
/// Wait for a selector. Without the `chrome` feature this is a no-op stub kept so callers compile either way.
pub fn with_wait_for_selector(
    &mut self,
    _wait_for_selector: Option<WaitForSelector>,
) -> &mut Self {
    self
}

#[cfg(feature = "chrome")]
/// Wait for a delay before returning the page. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled.
pub fn with_wait_for_delay(&mut self, wait_for_delay: Option<WaitForDelay>) -> &mut Self {
    // create the wait_for container on first use, then set only the delay option.
    self.wait_for.get_or_insert_with(WaitFor::default).delay = wait_for_delay;
    self
}

#[cfg(not(feature = "chrome"))]
/// Wait for with delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled.
pub fn with_wait_for_delay(&mut self, _wait_for_delay: Option<WaitForDelay>) -> &mut Self {
    self
}

#[cfg(feature = "chrome_intercept")]
/// Use request intercept for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the `chrome_intercept` is not enabled.
pub fn with_chrome_intercept(
Expand Down
5 changes: 2 additions & 3 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,10 +288,9 @@ impl Page {
url: &str,
client: &Client,
page: &chromiumoxide::Page,
wait_for_network_idle: &Option<crate::configuration::WaitForIdleNetwork>,
wait_for: &Option<crate::configuration::WaitFor>,
) -> Self {
let page_resource =
crate::utils::fetch_page_html(&url, &client, &page, wait_for_network_idle).await;
let page_resource = crate::utils::fetch_page_html(&url, &client, &page, wait_for).await;
let mut p = build(url, page_resource);

// store the chrome page to perform actions like screenshots etc.
Expand Down
Loading

0 comments on commit 7932600

Please sign in to comment.