Skip to content

Commit

Permalink
chore(website): fix limit budget breaking
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 27, 2024
1 parent da8fbf2 commit 21abbbd
Show file tree
Hide file tree
Showing 8 changed files with 78 additions and 39 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.13.74"
version = "2.13.78"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
93 changes: 66 additions & 27 deletions spider/src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2212,6 +2212,9 @@ impl Website {

let mut set: JoinSet<HashSet<CaseInsensitiveString>> = JoinSet::new();

// track budgeting one time.
let mut exceeded_budget = false;

'outer: loop {
let stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
links.drain().collect(),
Expand All @@ -2234,8 +2237,10 @@ impl Website {
let allowed = self.is_allowed(&link);

if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
break 'outer;
exceeded_budget = true;
break;
}

if allowed.eq(&ProcessLinkStatus::Blocked) {
continue;
}
Expand Down Expand Up @@ -2312,7 +2317,7 @@ impl Website {
let allowed = self.is_allowed(&s);

if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
break 'outer;
break;
}
if allowed.eq(&ProcessLinkStatus::Blocked) {
continue;
Expand All @@ -2326,7 +2331,7 @@ impl Website {
match result {
Ok(res) => {
// todo: add final url catching domains to make sure we do not add extra pages.
self.links_visited.extend_links(&mut links, res)
self.links_visited.extend_links(&mut links, res);
},
Err(_) => {
break
Expand All @@ -2337,7 +2342,11 @@ impl Website {
else => break,
}

if links.is_empty() && set.is_empty() {
if links.is_empty() && set.is_empty() || exceeded_budget {
// await for all tasks to complete.
if exceeded_budget {
set.join_all().await;
}
break 'outer;
}
}
Expand Down Expand Up @@ -2428,6 +2437,7 @@ impl Website {
let on_link_find_callback = self.on_link_find_callback;
let full_resources = self.configuration.full_resources;
let return_page_links = self.configuration.return_page_links;
let mut exceeded_budget = false;

'outer: loop {
let stream =
Expand Down Expand Up @@ -2461,7 +2471,8 @@ impl Website {
if allowed
.eq(&ProcessLinkStatus::BudgetExceeded)
{
break 'outer;
exceeded_budget = true;
break;
}
if allowed.eq(&ProcessLinkStatus::Blocked) {
continue;
Expand Down Expand Up @@ -2615,9 +2626,8 @@ impl Website {
let s = link.into();
let allowed = self.is_allowed(&s);

if allowed.eq(
&ProcessLinkStatus::BudgetExceeded,
) {
if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
exceeded_budget = true;
break;
}
if allowed
Expand Down Expand Up @@ -2645,7 +2655,10 @@ impl Website {
else => break,
};

if links.is_empty() && set.is_empty() {
if links.is_empty() && set.is_empty() || exceeded_budget {
if exceeded_budget {
set.join_all().await;
}
break 'outer;
}
}
Expand Down Expand Up @@ -2718,6 +2731,7 @@ impl Website {
.await;

let mut set: JoinSet<HashSet<CaseInsensitiveString>> = JoinSet::new();
let mut exceeded_budget = false;

'outer: loop {
let stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
Expand All @@ -2742,7 +2756,8 @@ impl Website {
let allowed = self.is_allowed(&link);

if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
break 'outer;
exceeded_budget = true;
break;
}
if allowed.eq(&ProcessLinkStatus::Blocked) {
continue;
Expand All @@ -2755,7 +2770,6 @@ impl Website {
match SEM.acquire().await {
Ok(permit) => {
let client = client.clone();
tokio::task::yield_now().await;

spawn_set("page_fetch", &mut set, async move {
let link_results = match on_link_find_callback {
Expand Down Expand Up @@ -2790,7 +2804,7 @@ impl Website {
if allowed
.eq(&ProcessLinkStatus::BudgetExceeded)
{
break 'outer;
break;
}
if allowed.eq(&ProcessLinkStatus::Blocked) {
continue;
Expand All @@ -2808,6 +2822,9 @@ impl Website {
}
_ => break,
}
if exceeded_budget {
break;
}
}

while let Some(res) = set.join_next().await {
Expand All @@ -2816,7 +2833,7 @@ impl Website {
}
}

if links.is_empty() {
if links.is_empty() || exceeded_budget {
break;
}
}
Expand Down Expand Up @@ -2895,6 +2912,7 @@ impl Website {
));

let add_external = self.configuration.external_domains_caseless.len() > 0;
let mut exceeded_budget = false;

'outer: loop {
let stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
Expand Down Expand Up @@ -2926,7 +2944,8 @@ impl Website {
let allowed = self.is_allowed(&link);

if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
break 'outer;
exceeded_budget = true;
break;
}
if allowed.eq(&ProcessLinkStatus::Blocked) {
continue;
Expand Down Expand Up @@ -2970,12 +2989,8 @@ impl Website {
let next_page = Page::new_page(url, &shared.0).await;

page.clone_from(&next_page)


};



}).await
{
log::info!("backoff gateway timeout exceeded {elasped}");
Expand Down Expand Up @@ -3044,7 +3059,8 @@ impl Website {
if allowed
.eq(&ProcessLinkStatus::BudgetExceeded)
{
break 'outer;
exceeded_budget = true;
break;
}
if allowed.eq(&ProcessLinkStatus::Blocked) {
continue;
Expand All @@ -3066,7 +3082,10 @@ impl Website {
else => break,
}

if links.is_empty() && set.is_empty() {
if links.is_empty() && set.is_empty() || exceeded_budget {
if exceeded_budget {
set.join_all().await;
}
break 'outer;
}
}
Expand Down Expand Up @@ -3166,6 +3185,7 @@ impl Website {
};

let retry = self.configuration.retry;
let mut exceeded_budget = false;

'outer: loop {
let stream =
Expand Down Expand Up @@ -3226,6 +3246,7 @@ impl Website {
if allowed.eq(
&ProcessLinkStatus::BudgetExceeded,
) {
exceeded_budget = true;
break;
}
if allowed
Expand Down Expand Up @@ -3300,7 +3321,7 @@ impl Website {
Err(err) => log("http parse error: ", err.to_string()),
};
}
Err(err) => log("http network error: ", err.to_string()),
Err(err) => log::info!("http network error: {}", err.to_string()),
};

drop(tx);
Expand All @@ -3327,7 +3348,8 @@ impl Website {
let allowed = self.is_allowed(&s);

if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
break 'outer;
exceeded_budget = true;
break;
}
if allowed.eq(&ProcessLinkStatus::Blocked) {
continue;
Expand All @@ -3340,9 +3362,13 @@ impl Website {
_ => (),
}
}

if exceeded_budget {
break;
}
}

if sitemaps.len() == 0 {
if sitemaps.len() == 0 || exceeded_budget {
break;
}
}
Expand Down Expand Up @@ -3417,6 +3443,8 @@ impl Website {
_ => Default::default(),
};

let mut exceeded_budget = false;

'outer: loop {
let stream = tokio_stream::iter::<Vec<Box<CompactString>>>(
sitemaps.drain(..).collect(),
Expand Down Expand Up @@ -3484,7 +3512,8 @@ impl Website {
self.is_allowed(&link);

if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
break 'outer;
exceeded_budget = true;
break;
}
if allowed.eq(
&ProcessLinkStatus::Blocked,
Expand Down Expand Up @@ -3583,12 +3612,22 @@ impl Website {
err.msg(),
),
};

if exceeded_budget {
break;
}
}
}
Err(err) => log("http parse error: ", err.to_string()),
Err(err) => log::info!(
"http sitemap parse error: {}",
err.to_string()
),
};
}
Err(err) => log("http network error: ", err.to_string()),
Err(err) => log::info!(
"http sitemap network error: {}",
err.to_string()
),
};

drop(tx);
Expand All @@ -3609,7 +3648,7 @@ impl Website {
}
}

if sitemaps.len() == 0 {
if sitemaps.len() == 0 || exceeded_budget {
break;
}
}
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.13.74"
version = "2.13.78"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.13.74"
version = "2.13.78"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.13.74"
version = "2.13.78"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.13.74"
version = "2.13.78"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
Loading

0 comments on commit 21abbbd

Please sign in to comment.