Skip to content

Commit

Permalink
chore(chrome): add block list items
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 8, 2024
1 parent 1c6905d commit cb9f547
Show file tree
Hide file tree
Showing 12 changed files with 137 additions and 35 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 8 additions & 1 deletion examples/real_world.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,24 @@
extern crate spider;
use crate::spider::tokio::io::AsyncWriteExt;
use spider::features::chrome_common::RequestInterceptConfiguration;
use spider::tokio;
use spider::website::Website;
use spider::{
configuration::WaitForIdleNetwork, features::chrome_common::RequestInterceptConfiguration,
};
use spider_utils::spider_transformations::transformation::content::{
transform_content, ReturnFormat, TransformConfig,
};
use std::io::Result;
use std::time::Duration;

async fn crawl_website(url: &str) -> Result<()> {
let mut website: Website = Website::new(url)
.with_limit(1)
.with_chrome_intercept(RequestInterceptConfiguration::new(true))
.with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis(
200,
)))))
.with_stealth(true)
.with_return_page_links(true)
.with_fingerprint(true)
Expand Down
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.20.3"
version = "2.20.4"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
4 changes: 2 additions & 2 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ lazy_static! {
phf::phf_set! {
"jquery.min.js", "jquery.qtip.min.js", "jquery.js", "angular.js", "jquery.slim.js", "react.development.js", "react-dom.development.js", "react.production.min.js", "react-dom.production.min.js",
"vue.global.js", "vue.global.prod.js", "vue.runtime.", "vue.esm-browser.js", "vue.js", "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", "d3.js", "material-components-web.min.js",
"otSDKStub.js", "clipboard.min.js", "moment.js", "moment.min.js", "dexie.js", "layui.js", ".js?meteor_js_resource=true",
"otSDKStub.js", "clipboard.min.js", "moment.js", "moment.min.js", "dexie.js", "layui.js", ".js?meteor_js_resource=true", "lodash.min.js", "lodash.js",
// possible js that could be critical.
"app.js", "main.js", "index.js", "bundle.js", "vendor.js",
}
Expand Down Expand Up @@ -80,7 +80,7 @@ lazy_static! {
"react.development.js", "react-dom.development.js", "react.production.min.js",
"react-dom.production.min.js", "vue.global.js", "vue.global.prod.js", "vue.esm-browser.js", "vue.js",
"bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", ".js?meteor_js_resource=true",
"d3.js", "layui.js",
"d3.js", "layui.js", "lodash.min.js", "lodash.js",
"app.js", "main.js", "index.js", "bundle.js", "vendor.js",
// Verified 3rd parties for request
"https://m.stripe.network/inner.html",
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.20.3"
version = "2.20.4"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
Expand Down
2 changes: 2 additions & 0 deletions spider_chrome/src/handler/blockers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ pub mod linkedin_blockers;
pub mod netflix_blockers;
/// tiktok blockers
pub mod tiktok_blockers;
/// upwork blockers
pub mod upwork_blockers;
/// x blockers
pub mod x_blockers;

Expand Down
65 changes: 65 additions & 0 deletions spider_chrome/src/handler/blockers/upwork_blockers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use crate::handler::blockers::Trie;

lazy_static::lazy_static! {
    /// URL patterns that `block_upwork` tests with `contains_prefix` on every
    /// paused request, regardless of the `ignore_visuals` flag.
    ///
    /// NOTE(review): the last two entries start with `.` instead of a scheme;
    /// whether they can ever match depends on how `Trie::contains_prefix`
    /// walks the URL — confirm against the `Trie` implementation.
    static ref URL_IGNORE_TRIE: Trie = {
        let blocked_prefixes = [
            "https://www.upwork.com/shitake/suit",
            "https://www.upwork.com/upi/jslogger",
            "https://mpsnare.iesnare.com/5.8.1/logo.js",
            "https://first.iovation.com/",
            "https://zn0izjiulta2j2t4o-upwork.siteintercept.qualtrics.com/",
            "https://cdn123.forter.com/",
            "https://www.upwork.com/static/assets/TopNavSsi/visitor-v2/js/manifest.",
            "https://www.upwork.com/iojs/general5/static_wdp.js",
            "https://www.upwork.com/static/suit2-tracker/",
            "https://www.upwork.com/api/graphql/v1?alias=spellCheck",
            "https://www.upwork.com/api/graphql/v1?alias=relatedSuggestions",
            "https://www.upwork.com/api/graphql/v1?alias=autoSuggestions",
            ".siteintercept.qualtrics.com/",
            ".forter.com",
        ];
        let mut trie = Trie::new();
        blocked_prefixes.iter().for_each(|prefix| {
            trie.insert(prefix);
        });
        trie
    };

    /// URL patterns for upwork static assets (style bundles and the
    /// google-one-tap script) that `block_upwork_styles` tests with
    /// `contains_prefix`; `block_upwork` only consults this list when
    /// `ignore_visuals` is set.
    static ref URL_IGNORE_TRIE_STYLES: Trie = {
        let asset_prefixes = [
            "https://www.upwork.com/static/assets/TopNavSsi/visitor-v2/",
            // One missing link per style bundle still needs further investigation.
            "https://www.upwork.com/static/assets/UniversalSearchNuxt/styles~",
            "https://www.upwork.com/static/assets/Brontes/styles",
            "https://www.upwork.com/static/assets/Brontes/google-one-tap.6226625d.js",
        ];
        let mut trie = Trie::new();
        asset_prefixes.iter().for_each(|prefix| {
            trie.insert(prefix);
        });
        trie
    };
}

/// Returns `true` when the paused request's URL is matched by
/// `URL_IGNORE_TRIE_STYLES` (checked with `contains_prefix`), i.e. the
/// request is one of the listed upwork static assets.
pub fn block_upwork_styles(
    event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused,
) -> bool {
    let request_url = &event.request.url;
    URL_IGNORE_TRIE_STYLES.contains_prefix(request_url)
}

/// Decides whether a paused upwork request should be blocked.
///
/// Returns `true` when the request URL is matched by `URL_IGNORE_TRIE`.
/// Otherwise, when `ignore_visuals` is set, the decision is delegated to
/// `block_upwork_styles`; when it is not set the request is not blocked.
pub fn block_upwork(
    event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused,
    ignore_visuals: bool,
) -> bool {
    // Entries in the main ignore list are blocked unconditionally.
    if URL_IGNORE_TRIE.contains_prefix(&event.request.url) {
        return true;
    }
    // The style list is only evaluated when visuals are being ignored.
    ignore_visuals && block_upwork_styles(event)
}
68 changes: 48 additions & 20 deletions spider_chrome/src/handler/network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ use chromiumoxide_cdp::cdp::browser_protocol::{
};
use chromiumoxide_types::{Command, Method, MethodId};
use hashbrown::{HashMap, HashSet};
use lazy_static::lazy_static;
use std::collections::VecDeque;
use std::time::Duration;

lazy_static::lazy_static! {
lazy_static! {
/// allowed js frameworks and libs excluding some and adding additional URLs
pub static ref JS_FRAMEWORK_ALLOW: phf::Set<&'static str> = {
phf::phf_set! {
Expand All @@ -30,7 +31,7 @@ lazy_static::lazy_static! {
"react.development.js", "react-dom.development.js", "react.production.min.js",
"react-dom.production.min.js", "vue.global.js", "vue.esm-browser.js", "vue.js",
"bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js",
"d3.js",
"d3.js", "lodash.min.js", "lodash.js",
"app.js", "main.js", "index.js", "bundle.js", "vendor.js",
// Verified 3rd parties for request
"https://m.stripe.network/inner.html",
Expand Down Expand Up @@ -127,6 +128,8 @@ lazy_static::lazy_static! {
"https://cd.connatix.com",
"https://platform-api.sharethis.com/js/sharethis.js",
"https://js.hsforms.net/forms/embed/v2.js",
"https://static.parastorage.com/services/wix-thunderbolt/dist/",
"https://static.parastorage.com/services/tag-manager-client/",
".sharethis.com",
".newrelic.com",
".googlesyndication.com",
Expand Down Expand Up @@ -214,6 +217,7 @@ lazy_static::lazy_static! {
".onetrust.com/consent/",
"https://logs.",
"/track.php",
"/api/v1/bulklog"
];
for pattern in &patterns {
trie.insert(pattern);
Expand All @@ -239,6 +243,8 @@ lazy_static::lazy_static! {
"https://www.youtube.com/player_api", // Youtube player.
"https://www.googletagmanager.com/ns.html", // Google tag manager.
"https://consentcdn.cookiebot.com", // Cookie bot
"https://www.youtube.com/iframe_api", // Youtube iframes.
// "https://www.youtube.com/s/player/", // Youtube player not needed usually since iframe_api is used mainly
// vercel live
"https://vercel.live/api/",

Expand Down Expand Up @@ -360,32 +366,42 @@ pub enum NetworkInterceptManager {
LinkedIn,
/// netflix.com
Netflix,
/// upwork.com
Upwork,
#[default]
/// Unknown
Unknown,
}

lazy_static! {
/// Top tier list of the most common websites visited.
///
/// Each entry pairs a URL prefix with the `NetworkInterceptManager` variant
/// to use for it. `NetworkInterceptManager::new` scans the entries in order
/// and returns the variant of the first prefix the URL `starts_with`,
/// falling back to `Unknown` when none match.
///
/// The array length in the type (12) must be kept in sync with the number
/// of entries below when adding a site.
///
/// NOTE(review): matching is a plain string prefix test, so a URL such as
/// `https://x.com.example` would also match the `https://x.com` entry —
/// confirm whether a host boundary check is wanted.
pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 12] = [
("https://www.tiktok.com", NetworkInterceptManager::TikTok),
("https://tiktok.com", NetworkInterceptManager::TikTok),
("https://www.amazon.com", NetworkInterceptManager::Amazon),
("https://amazon.com", NetworkInterceptManager::Amazon),
("https://www.x.com", NetworkInterceptManager::X),
("https://x.com", NetworkInterceptManager::X),
("https://www.netflix.com", NetworkInterceptManager::Netflix),
("https://netflix.com", NetworkInterceptManager::Netflix),
(
"https://www.linkedin.com",
NetworkInterceptManager::LinkedIn
),
("https://linkedin.com", NetworkInterceptManager::LinkedIn),
("https://www.upwork.com", NetworkInterceptManager::Upwork),
("https://upwork.com", NetworkInterceptManager::Upwork),
];
}

impl NetworkInterceptManager {
/// a custom intercept handle.
pub fn new(url: &str) -> NetworkInterceptManager {
if url.starts_with("https://www.tiktok.com") || url.starts_with("https://tiktok.com") {
NetworkInterceptManager::TikTok
} else if url.starts_with("https://www.amazon.com") || url.starts_with("https://amazon.com")
{
NetworkInterceptManager::Amazon
} else if url.starts_with("https://www.x.com") || url.starts_with("https://x.com") {
NetworkInterceptManager::X
} else if url.starts_with("https://www.netflix.com")
|| url.starts_with("https://netflix.com")
{
NetworkInterceptManager::Netflix
} else if url.starts_with("https://www.linkedin.com")
|| url.starts_with("https://linkedin.com")
{
NetworkInterceptManager::LinkedIn
} else {
NetworkInterceptManager::Unknown
}
TOP_TIER_LIST
.iter()
.find(|&(pattern, _)| url.starts_with(pattern))
.map(|&(_, manager_type)| manager_type)
.unwrap_or(NetworkInterceptManager::Unknown)
}
/// Setup the intercept handle
pub fn setup(&mut self, url: &str) -> Self {
Expand Down Expand Up @@ -720,6 +736,12 @@ impl NetworkManager {
NetworkInterceptManager::LinkedIn => {
super::blockers::linkedin_blockers::block_linkedin(event)
}
NetworkInterceptManager::Upwork => {
super::blockers::upwork_blockers::block_upwork(
event,
self.ignore_visuals,
)
}
_ => skip_networking,
}
} else {
Expand Down Expand Up @@ -828,6 +850,12 @@ impl NetworkManager {
NetworkInterceptManager::LinkedIn => {
super::blockers::linkedin_blockers::block_linkedin(event)
}
NetworkInterceptManager::Upwork => {
super::blockers::upwork_blockers::block_upwork(
event,
self.ignore_visuals,
)
}
_ => skip_networking,
}
} else {
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.20.3"
version = "2.20.4"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.20.3"
version = "2.20.4"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.20.3"
version = "2.20.4"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.20.3"
version = "2.20.4"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down

0 comments on commit cb9f547

Please sign in to comment.