From ee098f2ea1962a2bf06aa893c2641076c8025f39 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Sun, 8 Dec 2024 08:41:16 -0500 Subject: [PATCH] chore(chrome): fix bytes timeout --- Cargo.lock | 12 ++++++------ spider/Cargo.toml | 2 +- spider/src/utils/mod.rs | 10 ++-------- spider_chrome/Cargo.toml | 2 +- spider_chrome/src/handler/network.rs | 23 +++++++++++++++++++++-- spider_chrome/src/page.rs | 24 ++---------------------- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 10 files changed, 37 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a5fa19784..b0fbf3327 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4278,7 +4278,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.20.5" +version = "2.20.6" dependencies = [ "ahash", "aho-corasick", @@ -4340,7 +4340,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.20.5" +version = "2.20.6" dependencies = [ "adblock", "async-tungstenite", @@ -4377,7 +4377,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.20.5" +version = "2.20.6" dependencies = [ "clap", "env_logger", @@ -4402,7 +4402,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.20.5" +version = "2.20.6" dependencies = [ "aho-corasick", "fast_html2md", @@ -4424,7 +4424,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.20.5" +version = "2.20.6" dependencies = [ "indexmap 1.9.3", "serde", @@ -4436,7 +4436,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.20.5" +version = "2.20.6" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 9951f8875..b01fe4557 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.20.5" +version = "2.20.6" authors = [ "j-mendez " ] diff --git a/spider/src/utils/mod.rs b/spider/src/utils/mod.rs index 5a6c3bbdd..dda235811 
100644 --- a/spider/src/utils/mod.rs +++ b/spider/src/utils/mod.rs @@ -1134,14 +1134,8 @@ pub async fn fetch_page_html_chrome_base( } } - let res = - tokio::time::timeout(tokio::time::Duration::from_secs(15), page.content_bytes()).await; - - let mut res: Box = match res { - Ok(b) => match b { - Ok(b) => b.into(), - _ => Default::default(), - }, + let mut res: Box = match page.content_bytes().await { + Ok(b) => b.into(), _ => Default::default(), }; diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index eac8ad944..12632905a 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.20.5" +version = "2.20.6" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs index ee294a69a..7ce5e6d7b 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -132,6 +132,16 @@ lazy_static! { "https://static.parastorage.com/services/tag-manager-client/", "https://www.datadoghq-browser-agent.com/datadog-rum-slim-v4.js", "https://cdn.rudderlabs.com", + "https://script.hotjar.com/", + "https://static.hotjar.com/", + "https://cdn.insurads.com/", + "https://cdn-ukwest.onetrust.com", + "https://cdn.onetrust.com", + "https://services.insurads.com/", + "https://platform.iteratehq.com/loader.js", + "https://acdn.adnxs.com/ast/ast.js", + "https://schibsted-cdn.relevant-digital.com/static/tags/", + "https://bat.bing.net", ".sharethis.com", ".newrelic.com", ".googlesyndication.com", @@ -179,7 +189,9 @@ lazy_static! { "ads.js", "analytics.js", "otSDKStub.js", + "otBannerSdk.js", "_vercel/insights/script.js", + "analytics." ]; for pattern in &patterns { trie.insert(pattern); @@ -212,6 +224,12 @@ lazy_static! 
{ "https://nimbleplot.com", "https://api.lab.amplitude.com/", "https://flag.lab.amplitude.com/sdk/v2/flags", + "https://cdn-ukwest.onetrust.com/", + "https://cdn.onetrust.com/", + "https://geolocation.onetrust.com/", + "https://assets.adobedtm.com/", + "https://sdkconfig.pulse.", + "https://bat.bing.net", ".wixapps.net/api/v1/bulklog", // video embeddings "https://video.squarespace-cdn.com/content/", @@ -219,10 +237,11 @@ lazy_static! { ".doubleclick.net", ".piano.io/", ".browsiprod.com", - ".onetrust.com/consent/", + ".onetrust.", "https://logs.", "/track.php", - "/api/v1/bulklog" + "/api/v1/bulklog", + "cookieconsentpub" ]; for pattern in &patterns { trie.insert(pattern); diff --git a/spider_chrome/src/page.rs b/spider_chrome/src/page.rs index 4bcb01852..ae1da850c 100644 --- a/spider_chrome/src/page.rs +++ b/spider_chrome/src/page.rs @@ -1277,17 +1277,7 @@ impl Page { pub async fn content(&self) -> Result { Ok(self .evaluate( - "{ - let retVal = ''; - if (document.doctype) { - retVal = new XMLSerializer().serializeToString(document.doctype); - } - if (document.documentElement) { - retVal += document.documentElement.outerHTML; - } - retVal - } - ", + "{let retVal = ''; if (document.doctype) { retVal = new XMLSerializer().serializeToString(document.doctype); } if (document.documentElement) { retVal += document.documentElement.outerHTML; } retVal }", ) .await? .into_value()?) @@ -1298,17 +1288,7 @@ impl Page { pub async fn content_bytes(&self) -> Result { Ok(self .evaluate( - "{ - let retVal = ''; - if (document.doctype) { - retVal = new XMLSerializer().serializeToString(document.doctype); - } - if (document.documentElement) { - retVal += document.documentElement.outerHTML; - } - retVal - } - ", + "{let retVal = ''; if (document.doctype) { retVal = new XMLSerializer().serializeToString(document.doctype); } if (document.documentElement) { retVal += document.documentElement.outerHTML; } retVal }", ) .await? .into_value()?) 
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 11651eaa9..097e84562 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.20.5" +version = "2.20.6" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 6f7c8f495..fbaf6f636 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.20.5" +version = "2.20.6" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 86fab0b52..011358796 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.20.5" +version = "2.20.6" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 7d1194ab3..8338ccf65 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.20.5" +version = "2.20.6" authors = [ "j-mendez " ]