Skip to content

Commit

Permalink
feat(transformations): add spider_transformation crate
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Sep 21, 2024
1 parent 78b81a7 commit 200bc48
Show file tree
Hide file tree
Showing 24 changed files with 8,527 additions and 46 deletions.
225 changes: 192 additions & 33 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ members = [
"spider_worker",
"spider_cli",
"spider_utils",
"spider_transformations",
# internal
"examples",
"benches"]
Expand Down
9 changes: 7 additions & 2 deletions examples/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "2.5.5"
version = "2.6.2"
authors = [
"j-mendez <jeff@spider.cloud>",
]
Expand Down Expand Up @@ -205,4 +205,9 @@ required-features = ["spider/sync"]
[[example]]
name = "css_scrape"
path = "css_scrape.rs"
required-features = ["spider/sync"]
required-features = ["spider/sync"]

[[example]]
name = "transform_markdown"
path = "transform_markdown.rs"
required-features = ["spider/sync", "spider_utils/transformations"]
48 changes: 48 additions & 0 deletions examples/transform_markdown.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
//! `cargo run --example transform_markdown --features="spider/sync spider_utils/transformations"`
extern crate spider;

use spider::tokio;
use spider::website::Website;
use spider_utils::spider_transformations::transformation::content::{
transform_content, ReturnFormat, TransformConfig,
};
use tokio::io::AsyncWriteExt;

/// Crawls `https://rsseau.fr`, runs every received page through
/// `transform_content` with the return format set to Markdown, and prints the
/// result to stdout followed by a short timing summary.
#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://rsseau.fr");
    let mut rx2: tokio::sync::broadcast::Receiver<spider::page::Page> = website
        .subscribe(0)
        .expect("failed to subscribe to crawled pages");
    let mut stdout = tokio::io::stdout();

    let mut conf = TransformConfig::default();
    conf.return_format = ReturnFormat::Markdown;

    // The printer task owns stdout while the crawl runs and hands it back when
    // the receive loop ends, so the summary below reuses the same handle.
    let join_handle = tokio::spawn(async move {
        // The loop stops on the first receive error.
        // NOTE(review): presumably that is the channel closing after
        // `unsubscribe()`; a lagged receiver would also end it - confirm.
        while let Ok(res) = rx2.recv().await {
            let markup = transform_content(&res, &conf, &None, &None);

            // Best effort: a failed write must not abort the crawl output.
            let _ = stdout
                .write_all(format!("- {}\n {}\n", res.get_url(), markup).as_bytes())
                .await;
        }
        stdout
    });

    let start = std::time::Instant::now();
    website.crawl().await;
    website.unsubscribe();
    let duration = start.elapsed();
    let mut stdout = join_handle.await.expect("stdout printer task panicked");

    let _ = stdout
        .write_all(
            format!(
                "Time elapsed in website.crawl() is: {:?} for total pages: {:?}",
                duration,
                website.get_links().len()
            )
            .as_bytes(),
        )
        .await;

    // Flush before returning: without it the final summary may still be
    // pending inside the async stdout handle when the runtime shuts down.
    let _ = stdout.flush().await;
}
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.5.5"
version = "2.6.2"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
18 changes: 17 additions & 1 deletion spider/src/utils/header_utils.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::configuration::Configuration;
use reqwest::header::REFERER;
use reqwest::header::{HOST, REFERER};
use reqwest::{
header::{HeaderMap, HeaderValue},
ClientBuilder,
Expand All @@ -10,6 +10,7 @@ pub fn setup_default_headers(
client_builder: ClientBuilder,
configuration: &Configuration,
header_map: HeaderMap,
url: &Option<Box<url::Url>>,
) -> ClientBuilder {
let mut headers = match configuration.headers.clone() {
Some(h) => *h,
Expand All @@ -26,6 +27,21 @@ pub fn setup_default_headers(
}
}

if !headers.contains_key(HOST) {
match url {
Some(u) => {
if let Some(host) = u.host_str() {
if let Ok(ref_value) = HeaderValue::from_str(&host) {
if !ref_value.is_empty() {
headers.insert(HOST, ref_value);
}
}
}
}
_ => (),
}
}

headers.extend(header_map);

client_builder.default_headers(headers)
Expand Down
16 changes: 12 additions & 4 deletions spider/src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -786,8 +786,12 @@ impl Website {
client
};

let client =
crate::utils::header_utils::setup_default_headers(client, &self.configuration, headers);
let client = crate::utils::header_utils::setup_default_headers(
client,
&self.configuration,
headers,
self.get_url_parsed(),
);

let mut client = match &self.configuration.request_timeout {
Some(t) => client.timeout(**t),
Expand Down Expand Up @@ -841,8 +845,12 @@ impl Website {
client
};

let client =
crate::utils::header_utils::setup_default_headers(client, &self.configuration, headers);
let client = crate::utils::header_utils::setup_default_headers(
client,
&self.configuration,
headers,
self.get_url_parsed(),
);

let mut client = match &self.configuration.request_timeout {
Some(t) => client.timeout(**t),
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.5.5"
version = "2.6.2"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
Loading

0 comments on commit 200bc48

Please sign in to comment.