From cb9f5474217183ff5fca3163da22ebdb65a9b8c3 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Sun, 8 Dec 2024 06:17:41 -0500 Subject: [PATCH] chore(chrome): add block list items --- Cargo.lock | 12 ++-- examples/real_world.rs | 9 ++- spider/Cargo.toml | 2 +- spider/src/page.rs | 4 +- spider_chrome/Cargo.toml | 2 +- spider_chrome/src/handler/blockers/mod.rs | 2 + .../src/handler/blockers/upwork_blockers.rs | 65 ++++++++++++++++++ spider_chrome/src/handler/network.rs | 68 +++++++++++++------ spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 12 files changed, 137 insertions(+), 35 deletions(-) create mode 100644 spider_chrome/src/handler/blockers/upwork_blockers.rs diff --git a/Cargo.lock b/Cargo.lock index fd5d968428..d1e8332850 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4278,7 +4278,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.20.3" +version = "2.20.4" dependencies = [ "ahash", "aho-corasick", @@ -4340,7 +4340,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.20.3" +version = "2.20.4" dependencies = [ "adblock", "async-tungstenite", @@ -4377,7 +4377,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.20.3" +version = "2.20.4" dependencies = [ "clap", "env_logger", @@ -4402,7 +4402,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.20.3" +version = "2.20.4" dependencies = [ "aho-corasick", "fast_html2md", @@ -4424,7 +4424,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.20.3" +version = "2.20.4" dependencies = [ "indexmap 1.9.3", "serde", @@ -4436,7 +4436,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.20.3" +version = "2.20.4" dependencies = [ "env_logger", "lazy_static", diff --git a/examples/real_world.rs b/examples/real_world.rs index d0c999ddd4..deed1e62e4 100644 --- a/examples/real_world.rs +++ b/examples/real_world.rs @@ -2,17 +2,24 @@ extern crate 
spider; use crate::spider::tokio::io::AsyncWriteExt; -use spider::features::chrome_common::RequestInterceptConfiguration; use spider::tokio; use spider::website::Website; +use spider::{ + configuration::WaitForIdleNetwork, features::chrome_common::RequestInterceptConfiguration, +}; use spider_utils::spider_transformations::transformation::content::{ transform_content, ReturnFormat, TransformConfig, }; use std::io::Result; +use std::time::Duration; + async fn crawl_website(url: &str) -> Result<()> { let mut website: Website = Website::new(url) .with_limit(1) .with_chrome_intercept(RequestInterceptConfiguration::new(true)) + .with_wait_for_idle_network(Some(WaitForIdleNetwork::new(Some(Duration::from_millis( + 200, + ))))) .with_stealth(true) .with_return_page_links(true) .with_fingerprint(true) diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 19487e6bae..825010a606 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.20.3" +version = "2.20.4" authors = [ "j-mendez " ] diff --git a/spider/src/page.rs b/spider/src/page.rs index 154183f3ee..eb5488c9ec 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -38,7 +38,7 @@ lazy_static! { phf::phf_set! { "jquery.min.js", "jquery.qtip.min.js", "jquery.js", "angular.js", "jquery.slim.js", "react.development.js", "react-dom.development.js", "react.production.min.js", "react-dom.production.min.js", "vue.global.js", "vue.global.prod.js", "vue.runtime.", "vue.esm-browser.js", "vue.js", "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", "d3.js", "material-components-web.min.js", - "otSDKStub.js", "clipboard.min.js", "moment.js", "moment.min.js", "dexie.js", "layui.js", ".js?meteor_js_resource=true", + "otSDKStub.js", "clipboard.min.js", "moment.js", "moment.min.js", "dexie.js", "layui.js", ".js?meteor_js_resource=true", "lodash.min.js", "lodash.js", // possible js that could be critical. 
"app.js", "main.js", "index.js", "bundle.js", "vendor.js", } @@ -80,7 +80,7 @@ lazy_static! { "react.development.js", "react-dom.development.js", "react.production.min.js", "react-dom.production.min.js", "vue.global.js", "vue.global.prod.js", "vue.esm-browser.js", "vue.js", "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", ".js?meteor_js_resource=true", - "d3.js", "layui.js", + "d3.js", "layui.js", "lodash.min.js", "lodash.js", "app.js", "main.js", "index.js", "bundle.js", "vendor.js", // Verified 3rd parties for request "https://m.stripe.network/inner.html", diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 03e467c0d2..91dfd6d1d4 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.20.3" +version = "2.20.4" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_chrome/src/handler/blockers/mod.rs b/spider_chrome/src/handler/blockers/mod.rs index fbadc0c179..76730121f4 100644 --- a/spider_chrome/src/handler/blockers/mod.rs +++ b/spider_chrome/src/handler/blockers/mod.rs @@ -8,6 +8,8 @@ pub mod linkedin_blockers; pub mod netflix_blockers; /// tiktok blockers pub mod tiktok_blockers; +/// upwork blockers +pub mod upwork_blockers; /// x blockers pub mod x_blockers; diff --git a/spider_chrome/src/handler/blockers/upwork_blockers.rs b/spider_chrome/src/handler/blockers/upwork_blockers.rs new file mode 100644 index 0000000000..01f4c2f801 --- /dev/null +++ b/spider_chrome/src/handler/blockers/upwork_blockers.rs @@ -0,0 +1,65 @@ +use crate::handler::blockers::Trie; + +lazy_static::lazy_static! { + /// Ignore list of urls. 
+ static ref URL_IGNORE_TRIE: Trie = { + let mut trie = Trie::new(); + let patterns = [ + "https://www.upwork.com/shitake/suit", + "https://www.upwork.com/upi/jslogger", + "https://mpsnare.iesnare.com/5.8.1/logo.js", + "https://first.iovation.com/", + "https://zn0izjiulta2j2t4o-upwork.siteintercept.qualtrics.com/", + "https://cdn123.forter.com/", + "https://www.upwork.com/static/assets/TopNavSsi/visitor-v2/js/manifest.", + "https://www.upwork.com/iojs/general5/static_wdp.js", + "https://www.upwork.com/static/suit2-tracker/", + "https://www.upwork.com/api/graphql/v1?alias=spellCheck", + "https://www.upwork.com/api/graphql/v1?alias=relatedSuggestions", + "https://www.upwork.com/api/graphql/v1?alias=autoSuggestions", + ".siteintercept.qualtrics.com/", + ".forter.com", + ]; + for pattern in &patterns { + trie.insert(pattern); + } + trie + }; + + /// Ignore list of visual asset urls, only applied when visuals are ignored. + static ref URL_IGNORE_TRIE_STYLES: Trie = { + let mut trie = Trie::new(); + let patterns = [ + "https://www.upwork.com/static/assets/TopNavSsi/visitor-v2/", + // 1 missing link needs further looking into for each of the styles + "https://www.upwork.com/static/assets/UniversalSearchNuxt/styles~", + "https://www.upwork.com/static/assets/Brontes/styles", + "https://www.upwork.com/static/assets/Brontes/google-one-tap.6226625d.js" + + ]; + for pattern in &patterns { + trie.insert(pattern); + } + trie + }; +} + +/// Block upwork style and visual asset requests that are not required. +pub fn block_upwork_styles( + event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, +) -> bool { + URL_IGNORE_TRIE_STYLES.contains_prefix(&event.request.url) +} + +/// Block upwork requests that are not required, including visual assets when `ignore_visuals` is set. +pub fn block_upwork( + event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, + ignore_visuals: bool, +) -> bool { + let blocked = URL_IGNORE_TRIE.contains_prefix(&event.request.url); + if !blocked && ignore_visuals { + block_upwork_styles(event) + } else { + blocked + } +} diff --git
a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs index 79d243cc38..5d84cbbb6e 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -18,10 +18,11 @@ use chromiumoxide_cdp::cdp::browser_protocol::{ }; use chromiumoxide_types::{Command, Method, MethodId}; use hashbrown::{HashMap, HashSet}; +use lazy_static::lazy_static; use std::collections::VecDeque; use std::time::Duration; -lazy_static::lazy_static! { +lazy_static! { /// allowed js frameworks and libs excluding some and adding additional URLs pub static ref JS_FRAMEWORK_ALLOW: phf::Set<&'static str> = { phf::phf_set! { @@ -30,7 +31,7 @@ lazy_static::lazy_static! { "react.development.js", "react-dom.development.js", "react.production.min.js", "react-dom.production.min.js", "vue.global.js", "vue.esm-browser.js", "vue.js", "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", - "d3.js", + "d3.js", "lodash.min.js", "lodash.js", "app.js", "main.js", "index.js", "bundle.js", "vendor.js", // Verified 3rd parties for request "https://m.stripe.network/inner.html", @@ -127,6 +128,8 @@ lazy_static::lazy_static! { "https://cd.connatix.com", "https://platform-api.sharethis.com/js/sharethis.js", "https://js.hsforms.net/forms/embed/v2.js", + "https://static.parastorage.com/services/wix-thunderbolt/dist/", + "https://static.parastorage.com/services/tag-manager-client/", ".sharethis.com", ".newrelic.com", ".googlesyndication.com", @@ -214,6 +217,7 @@ lazy_static::lazy_static! { ".onetrust.com/consent/", "https://logs.", "/track.php", + "/api/v1/bulklog" ]; for pattern in &patterns { trie.insert(pattern); @@ -239,6 +243,8 @@ lazy_static::lazy_static! { "https://www.youtube.com/player_api", // Youtube player. "https://www.googletagmanager.com/ns.html", // Google tag manager. "https://consentcdn.cookiebot.com", // Cookie bot + "https://www.youtube.com/iframe_api", // Youtube iframes. 
+ // "https://www.youtube.com/s/player/", // Youtube player not needed usually since iframe_api is used mainly // vercel live "https://vercel.live/api/", @@ -360,32 +366,42 @@ pub enum NetworkInterceptManager { LinkedIn, /// netflix.com Netflix, + /// upwork.com + Upwork, #[default] /// Unknown Unknown, } +lazy_static! { + /// Top tier list of commonly visited websites, mapping each URL prefix to its intercept manager. + pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 12] = [ + ("https://www.tiktok.com", NetworkInterceptManager::TikTok), + ("https://tiktok.com", NetworkInterceptManager::TikTok), + ("https://www.amazon.com", NetworkInterceptManager::Amazon), + ("https://amazon.com", NetworkInterceptManager::Amazon), + ("https://www.x.com", NetworkInterceptManager::X), + ("https://x.com", NetworkInterceptManager::X), + ("https://www.netflix.com", NetworkInterceptManager::Netflix), + ("https://netflix.com", NetworkInterceptManager::Netflix), + ( + "https://www.linkedin.com", + NetworkInterceptManager::LinkedIn + ), + ("https://linkedin.com", NetworkInterceptManager::LinkedIn), + ("https://www.upwork.com", NetworkInterceptManager::Upwork), + ("https://upwork.com", NetworkInterceptManager::Upwork), + ]; +} + impl NetworkInterceptManager { /// a custom intercept handle.
pub fn new(url: &str) -> NetworkInterceptManager { - if url.starts_with("https://www.tiktok.com") || url.starts_with("https://tiktok.com") { - NetworkInterceptManager::TikTok - } else if url.starts_with("https://www.amazon.com") || url.starts_with("https://amazon.com") - { - NetworkInterceptManager::Amazon - } else if url.starts_with("https://www.x.com") || url.starts_with("https://x.com") { - NetworkInterceptManager::X - } else if url.starts_with("https://www.netflix.com") - || url.starts_with("https://netflix.com") - { - NetworkInterceptManager::Netflix - } else if url.starts_with("https://www.linkedin.com") - || url.starts_with("https://linkedin.com") - { - NetworkInterceptManager::LinkedIn - } else { - NetworkInterceptManager::Unknown - } + TOP_TIER_LIST + .iter() + .find(|&(pattern, _)| url.starts_with(pattern)) + .map(|&(_, manager_type)| manager_type) + .unwrap_or(NetworkInterceptManager::Unknown) } /// Setup the intercept handle pub fn setup(&mut self, url: &str) -> Self { @@ -720,6 +736,12 @@ impl NetworkManager { NetworkInterceptManager::LinkedIn => { super::blockers::linkedin_blockers::block_linkedin(event) } + NetworkInterceptManager::Upwork => { + super::blockers::upwork_blockers::block_upwork( + event, + self.ignore_visuals, + ) + } _ => skip_networking, } } else { @@ -828,6 +850,12 @@ impl NetworkManager { NetworkInterceptManager::LinkedIn => { super::blockers::linkedin_blockers::block_linkedin(event) } + NetworkInterceptManager::Upwork => { + super::blockers::upwork_blockers::block_upwork( + event, + self.ignore_visuals, + ) + } _ => skip_networking, } } else { diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index e10b3a3dc4..133f555aad 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.20.3" +version = "2.20.4" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 2a9807fce7..37ff1dcfb6 100644 --- 
a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.20.3" +version = "2.20.4" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index da9ffba916..1513b8469e 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.20.3" +version = "2.20.4" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 89d2f6733e..5d0c291475 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.20.3" +version = "2.20.4" authors = [ "j-mendez " ]