Skip to content

Commit

Permalink
ref(project-upstream): Emit error on multiple project fetch failures (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
iker-barriocanal authored Nov 10, 2023
1 parent a46cd21 commit 84a2526
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
- Add TTID and TTFD tags to mobile spans. ([#2662](https://github.com/getsentry/relay/pull/2662))
- Scrub all DB Core Data spans differently. ([#2686](https://github.com/getsentry/relay/pull/2686))
- Support generic metrics extraction version 2. ([#2692](https://github.com/getsentry/relay/pull/2692))
- Emit error on continued project config fetch failures after a time interval. ([#2700](https://github.com/getsentry/relay/pull/2700))

## 23.10.1

Expand Down
16 changes: 16 additions & 0 deletions relay-config/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,11 @@ struct Http {
///
/// This time is only used before going into a network outage mode.
retry_delay: u64,
/// The interval in seconds of continued project fetch failures after which Relay emits an error.
///
/// A successful fetch resets this interval. Relay does not emit the error during
/// long periods in which it sends no requests.
project_failure_interval: u64,
/// Content encoding to apply to upstream store requests.
///
/// By default, Relay applies `gzip` content encoding to compress upstream requests. Compression
Expand Down Expand Up @@ -743,6 +748,7 @@ impl Default for Http {
auth_interval: Some(600), // 10 minutes
outage_grace_period: DEFAULT_NETWORK_OUTAGE_GRACE_PERIOD,
retry_delay: default_retry_delay(),
project_failure_interval: default_project_failure_interval(),
encoding: HttpEncoding::Gzip,
}
}
Expand All @@ -753,6 +759,11 @@ fn default_retry_delay() -> u64 {
1
}

/// Returns the default project failure interval in seconds (90s).
fn default_project_failure_interval() -> u64 {
    const DEFAULT_INTERVAL_SECS: u64 = 90;
    DEFAULT_INTERVAL_SECS
}

/// Default for max memory size, 500 MB.
fn spool_envelopes_max_memory_size() -> ByteSize {
ByteSize::mebibytes(500)
Expand Down Expand Up @@ -1625,6 +1636,11 @@ impl Config {
Duration::from_secs(self.values.http.retry_delay)
}

/// Returns how long project requests must keep failing before Relay emits an error.
///
/// The configured value is stored in seconds and converted to a [`Duration`] here.
pub fn http_project_failure_interval(&self) -> Duration {
    let interval_secs = self.values.http.project_failure_interval;
    Duration::from_secs(interval_secs)
}

/// Content encoding of upstream requests.
pub fn http_encoding(&self) -> HttpEncoding {
self.values.http.encoding
Expand Down
34 changes: 33 additions & 1 deletion relay-server/src/actors/project_upstream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,16 @@ pub struct UpstreamProjectSourceService {
inner_tx: mpsc::UnboundedSender<Vec<Option<UpstreamResponse>>>,
inner_rx: mpsc::UnboundedReceiver<Vec<Option<UpstreamResponse>>>,
fetch_handle: SleepHandle,
/// Instant when the current streak of failed fetches started, `None` if there aren't any failures.
///
/// Relay sets this value to the instant when the first fetch fails, keeps it for
/// subsequent failures, and resets it to `None` on successful responses. Relay
/// does nothing during long periods without requests.
last_failed_fetch: Option<Instant>,
/// Duration of continued fetch failures before emitting an error.
///
/// Relay emits an error if all requests for at least this interval fail.
failure_interval: Duration,
}

impl UpstreamProjectSourceService {
Expand All @@ -181,9 +191,11 @@ impl UpstreamProjectSourceService {
state_channels: HashMap::new(),
fetch_handle: SleepHandle::idle(),
upstream_relay,
config,
inner_tx,
inner_rx,
last_failed_fetch: None,
failure_interval: config.http_project_failure_interval(),
config,
}
}

Expand Down Expand Up @@ -351,6 +363,7 @@ impl UpstreamProjectSourceService {
// Otherwise we might refuse to fetch any project configs because of a
// single, reproducible 500 we observed for a particular project.
self.backoff.reset();
self.last_failed_fetch = None;

// Count number of project states returned (via http requests).
metric!(
Expand Down Expand Up @@ -385,6 +398,8 @@ impl UpstreamProjectSourceService {
}
}
Err(err) => {
self.track_failed_response();

let attempts = channels_batch
.values()
.map(|b| b.attempts)
Expand Down Expand Up @@ -434,6 +449,23 @@ impl UpstreamProjectSourceService {
}
}

/// Records a failed fetch and emits an error once failures have persisted long enough.
///
/// On the first failure of a streak, only the current instant is stored in
/// `last_failed_fetch`. On every later failure, the time elapsed since that stored
/// instant is compared against `failure_interval`; once it is reached, an error is
/// logged together with the current number of backoff attempts.
fn track_failed_response(&mut self) {
    let Some(streak_start) = self.last_failed_fetch else {
        // Start of a new failure streak: remember when it began, nothing to report yet.
        self.last_failed_fetch = Some(Instant::now());
        return;
    };

    let failing_for = streak_start.elapsed();
    if failing_for < self.failure_interval {
        return;
    }

    relay_log::error!(
        failure_duration = format!("{} seconds", failing_for.as_secs()),
        backoff_attempts = self.backoff.attempt(),
        "can't fetch project states"
    );
}

/// Creates the async task to fetch the project states.
fn do_fetch(&mut self) {
self.fetch_handle.reset();
Expand Down

0 comments on commit 84a2526

Please sign in to comment.