
Commit c5564e7

Merge pull request #20 from G8XSU/retry
Add Retry utility with RetryPolicy definition
2 parents f4c561d + e7fa784 commit c5564e7

4 files changed: +323 -0 lines changed


Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,8 @@ build = "build.rs"
 [dependencies]
 prost = "0.11.6"
 reqwest = { version = "0.11.13", features = ["rustls-tls"] }
+tokio = { version = "1", default-features = false, features = ["time"] }
+rand = "0.8.5"
 
 [target.'cfg(genproto)'.build-dependencies]
 prost-build = { version = "0.11.3" }

src/util/mod.rs

Lines changed: 3 additions & 0 deletions
@@ -2,3 +2,6 @@
 ///
 /// [`StorableBuilder`]: storable_builder::StorableBuilder
 pub mod storable_builder;
+
+/// Contains retry utilities.
+pub mod retry;

src/util/retry.rs

Lines changed: 220 additions & 0 deletions
@@ -0,0 +1,220 @@
use rand::Rng;
use std::error::Error;
use std::future::Future;
use std::marker::PhantomData;
use std::time::Duration;

/// Performs and retries the given operation according to a retry policy.
///
/// **Caution**: A retry policy that does not cap the number of attempts with the
/// [`MaxAttemptsRetryPolicy`] decorator will result in infinite retries.
///
/// **Example**
/// ```rust
/// # use std::time::Duration;
/// # use vss_client::error::VssError;
/// # use vss_client::util::retry::{ExponentialBackoffRetryPolicy, retry, RetryPolicy};
/// #
/// # async fn operation() -> Result<i32, VssError> {
/// #     tokio::time::sleep(Duration::from_millis(10)).await;
/// #     Ok(42)
/// # }
/// #
/// let retry_policy = ExponentialBackoffRetryPolicy::new(Duration::from_millis(100))
///     .with_max_attempts(5)
///     .with_max_total_delay(Duration::from_secs(2))
///     .with_max_jitter(Duration::from_millis(30))
///     .skip_retry_on_error(|e| matches!(e, VssError::InvalidRequestError(..)));
///
/// let result = retry(operation, &retry_policy);
/// ```
pub async fn retry<R, F, Fut, T, E>(mut operation: F, retry_policy: &R) -> Result<T, E>
where
    R: RetryPolicy<E = E>,
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
    E: Error,
{
    let mut attempts_made = 0;
    let mut accumulated_delay = Duration::ZERO;
    loop {
        match operation().await {
            Ok(result) => return Ok(result),
            Err(err) => {
                attempts_made += 1;
                if let Some(delay) =
                    retry_policy.next_delay(&RetryContext { attempts_made, accumulated_delay, error: &err })
                {
                    tokio::time::sleep(delay).await;
                    accumulated_delay += delay;
                } else {
                    return Err(err);
                }
            }
        }
    }
}

/// Provides the logic for how and when to perform retries.
pub trait RetryPolicy: Sized {
    /// The error type returned by the `operation` in `retry`.
    type E: Error;

    /// Returns the duration to wait before trying the next attempt.
    /// `context` represents the context of a retry operation.
    ///
    /// If `None` is returned then no further retry attempt is made.
    fn next_delay(&self, context: &RetryContext<Self::E>) -> Option<Duration>;

    /// Returns a new `RetryPolicy` that respects the given maximum attempts.
    fn with_max_attempts(self, max_attempts: u32) -> MaxAttemptsRetryPolicy<Self> {
        MaxAttemptsRetryPolicy { inner_policy: self, max_attempts }
    }

    /// Returns a new `RetryPolicy` that respects the given total delay.
    fn with_max_total_delay(self, max_total_delay: Duration) -> MaxTotalDelayRetryPolicy<Self> {
        MaxTotalDelayRetryPolicy { inner_policy: self, max_total_delay }
    }

    /// Returns a new `RetryPolicy` that adds jitter (random delay) to the underlying policy.
    fn with_max_jitter(self, max_jitter: Duration) -> JitteredRetryPolicy<Self> {
        JitteredRetryPolicy { inner_policy: self, max_jitter }
    }

    /// Skips retrying on errors for which `function` evaluates to `true`.
    fn skip_retry_on_error<F>(self, function: F) -> FilteredRetryPolicy<Self, F>
    where
        F: 'static + Fn(&Self::E) -> bool,
    {
        FilteredRetryPolicy { inner_policy: self, function }
    }
}

/// Represents the context of a retry operation.
///
/// The context holds key information about the retry operation
/// such as how many attempts have been made until now, the accumulated
/// delay between retries, and the error that triggered the retry.
pub struct RetryContext<'a, E: Error> {
    /// The number of attempts made until now, before attempting the next retry.
    attempts_made: u32,

    /// The amount of artificial delay we have already waited in between previous
    /// attempts. Does not include the time taken to execute the operation.
    accumulated_delay: Duration,

    /// The error encountered in the previous attempt.
    error: &'a E,
}

/// The exponential backoff strategy is a retry approach that doubles the delay between retries.
/// A combined exponential backoff and jitter strategy is recommended; see ["Exponential Backoff and Jitter"](https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/).
/// This helps avoid the [Thundering Herd Problem](https://en.wikipedia.org/wiki/Thundering_herd_problem).
pub struct ExponentialBackoffRetryPolicy<E> {
    /// The base delay duration for the backoff algorithm. The first retry is `base_delay` after the first attempt.
    base_delay: Duration,
    phantom: PhantomData<E>,
}

impl<E: Error> ExponentialBackoffRetryPolicy<E> {
    /// Constructs a new instance using `base_delay`.
    ///
    /// `base_delay` is the base delay duration for the backoff algorithm. The first retry is `base_delay`
    /// after the first attempt.
    pub fn new(base_delay: Duration) -> ExponentialBackoffRetryPolicy<E> {
        Self { base_delay, phantom: PhantomData }
    }
}

impl<E: Error> RetryPolicy for ExponentialBackoffRetryPolicy<E> {
    type E = E;
    fn next_delay(&self, context: &RetryContext<Self::E>) -> Option<Duration> {
        let backoff_factor = 2_u32.pow(context.attempts_made) - 1;
        let delay = self.base_delay * backoff_factor;
        Some(delay)
    }
}

/// Decorates the given `RetryPolicy` to respect the given maximum attempts.
pub struct MaxAttemptsRetryPolicy<T: RetryPolicy> {
    /// The underlying retry policy to use.
    inner_policy: T,
    /// The maximum number of attempts to retry.
    max_attempts: u32,
}

impl<T: RetryPolicy> RetryPolicy for MaxAttemptsRetryPolicy<T> {
    type E = T::E;
    fn next_delay(&self, context: &RetryContext<Self::E>) -> Option<Duration> {
        if self.max_attempts == context.attempts_made {
            None
        } else {
            self.inner_policy.next_delay(context)
        }
    }
}

/// Decorates the given `RetryPolicy` to respect the given maximum total delay.
pub struct MaxTotalDelayRetryPolicy<T: RetryPolicy> {
    /// The underlying retry policy to use.
    inner_policy: T,
    /// The maximum accumulated delay that will be allowed over all attempts.
    max_total_delay: Duration,
}

impl<T: RetryPolicy> RetryPolicy for MaxTotalDelayRetryPolicy<T> {
    type E = T::E;
    fn next_delay(&self, context: &RetryContext<Self::E>) -> Option<Duration> {
        let next_delay = self.inner_policy.next_delay(context);
        if let Some(next_delay) = next_delay {
            if self.max_total_delay < context.accumulated_delay + next_delay {
                return None;
            }
        }
        next_delay
    }
}

/// Decorates the given `RetryPolicy` and adds jitter (random delay) to it. This can make retries
/// more spread out and less likely to all fail at once.
pub struct JitteredRetryPolicy<T: RetryPolicy> {
    /// The underlying retry policy to use.
    inner_policy: T,
    /// The maximum amount of random jitter to apply to the delay.
    max_jitter: Duration,
}

impl<T: RetryPolicy> RetryPolicy for JitteredRetryPolicy<T> {
    type E = T::E;
    fn next_delay(&self, context: &RetryContext<Self::E>) -> Option<Duration> {
        if let Some(base_delay) = self.inner_policy.next_delay(context) {
            let mut rng = rand::thread_rng();
            let jitter = Duration::from_micros(rng.gen_range(0..self.max_jitter.as_micros() as u64));
            Some(base_delay + jitter)
        } else {
            None
        }
    }
}

/// Decorates the given `RetryPolicy` by not retrying on errors that match the given function.
pub struct FilteredRetryPolicy<T: RetryPolicy, F> {
    inner_policy: T,
    function: F,
}

impl<T, F, E> RetryPolicy for FilteredRetryPolicy<T, F>
where
    T: RetryPolicy<E = E>,
    F: Fn(&E) -> bool,
    E: Error,
{
    type E = T::E;
    fn next_delay(&self, context: &RetryContext<E>) -> Option<Duration> {
        if (self.function)(&context.error) {
            None
        } else {
            self.inner_policy.next_delay(context)
        }
    }
}
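
Editor's note (not part of this commit): any type that implements `next_delay` picks up the `with_max_attempts`, `with_max_total_delay`, `with_max_jitter`, and `skip_retry_on_error` combinators for free from the trait's default methods. The sketch below shows a hypothetical `FixedIntervalRetryPolicy` composed with those decorators; the type and its constructor are illustrative only. For comparison, the committed `ExponentialBackoffRetryPolicy` waits `base_delay * (2^attempts_made - 1)` before each retry, i.e. 100 ms, 300 ms, 700 ms, … for a 100 ms base delay.

```rust
// Hypothetical sketch, not part of this commit: a constant-interval policy
// built on the `RetryPolicy` trait defined above.
use std::error::Error;
use std::marker::PhantomData;
use std::time::Duration;

use vss_client::util::retry::{RetryContext, RetryPolicy};

pub struct FixedIntervalRetryPolicy<E> {
    interval: Duration,
    phantom: PhantomData<E>,
}

impl<E: Error> FixedIntervalRetryPolicy<E> {
    pub fn new(interval: Duration) -> Self {
        Self { interval, phantom: PhantomData }
    }
}

impl<E: Error> RetryPolicy for FixedIntervalRetryPolicy<E> {
    type E = E;

    // Wait the same amount of time before every retry; the decorators from the
    // trait's default methods still decide when to stop.
    fn next_delay(&self, _context: &RetryContext<Self::E>) -> Option<Duration> {
        Some(self.interval)
    }
}

// Composing it with the provided decorators would look like:
// let policy = FixedIntervalRetryPolicy::new(Duration::from_millis(50))
//     .with_max_attempts(4)
//     .with_max_jitter(Duration::from_millis(10));
// let result = retry(some_fallible_async_operation, &policy).await;
```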

tests/retry_tests.rs

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
#[cfg(test)]
mod retry_tests {
    use std::io;
    use std::sync::atomic::{AtomicU32, Ordering};
    use std::sync::Arc;
    use std::time::Duration;

    use vss_client::error::VssError;
    use vss_client::util::retry::{retry, ExponentialBackoffRetryPolicy, RetryPolicy};

    #[tokio::test]
    async fn test_async_retry() {
        let base_delay = Duration::from_millis(10);
        let max_attempts = 3;
        let max_total_delay = Duration::from_secs(60);
        let max_jitter = Duration::from_millis(5);

        let exponential_backoff_jitter_policy = ExponentialBackoffRetryPolicy::new(base_delay)
            .skip_retry_on_error(|e| matches!(e, VssError::InvalidRequestError(..)))
            .with_max_attempts(max_attempts)
            .with_max_total_delay(max_total_delay)
            .with_max_jitter(max_jitter);

        let mut call_count = Arc::new(AtomicU32::new(0));
        let count = call_count.clone();
        let async_function = move || {
            let count = count.clone();
            async move {
                let attempts_made = count.fetch_add(1, Ordering::SeqCst);
                if attempts_made < max_attempts - 1 {
                    return Err(VssError::InternalServerError("Failure".to_string()));
                }
                tokio::time::sleep(Duration::from_millis(100)).await;
                Ok(42)
            }
        };

        let result = retry(async_function, &exponential_backoff_jitter_policy).await;
        assert_eq!(result.ok(), Some(42));
        assert_eq!(call_count.load(Ordering::SeqCst), max_attempts);

        call_count = Arc::new(AtomicU32::new(0));
        let count = call_count.clone();
        let failing_async_function = move || {
            let count = count.clone();
            async move {
                count.fetch_add(1, Ordering::SeqCst);
                tokio::time::sleep(Duration::from_millis(100)).await;
                Err::<(), VssError>(VssError::InternalServerError("Failed".to_string()))
            }
        };

        let failed_result = retry(failing_async_function, &exponential_backoff_jitter_policy).await;
        assert!(failed_result.is_err());
        assert_eq!(call_count.load(Ordering::SeqCst), 3);
    }

    #[tokio::test]
    async fn test_retry_on_all_errors() {
        let retry_policy = ExponentialBackoffRetryPolicy::new(Duration::from_millis(10)).with_max_attempts(3);

        let call_count = Arc::new(AtomicU32::new(0));
        let count = call_count.clone();
        let failing_async_function = move || {
            let count = count.clone();
            async move {
                count.fetch_add(1, Ordering::SeqCst);
                tokio::time::sleep(Duration::from_millis(100)).await;
                Err::<(), io::Error>(io::Error::new(io::ErrorKind::InvalidData, "Failure"))
            }
        };

        let failed_result = retry(failing_async_function, &retry_policy).await;
        assert!(failed_result.is_err());
        assert_eq!(call_count.load(Ordering::SeqCst), 3);
    }

    #[tokio::test]
    async fn test_retry_capped_by_max_total_delay() {
        let retry_policy = ExponentialBackoffRetryPolicy::new(Duration::from_millis(100))
            .with_max_total_delay(Duration::from_millis(350));

        let call_count = Arc::new(AtomicU32::new(0));
        let count = call_count.clone();
        let failing_async_function = move || {
            let count = count.clone();
            async move {
                count.fetch_add(1, Ordering::SeqCst);
                tokio::time::sleep(Duration::from_millis(100)).await;
                Err::<(), VssError>(VssError::InternalServerError("Failed".to_string()))
            }
        };

        let failed_result = retry(failing_async_function, &retry_policy).await;
        assert!(failed_result.is_err());
        assert_eq!(call_count.load(Ordering::SeqCst), 2);
    }
}
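
Editor's note on the expected call counts (not part of the commit): the exponential policy waits `base_delay * (2^attempts_made - 1)` before each retry, so in `test_retry_capped_by_max_total_delay` the waits would be 100 ms and then 300 ms; since 100 ms + 300 ms exceeds the 350 ms cap, `MaxTotalDelayRetryPolicy` returns `None` for the second retry and the operation runs exactly twice. A minimal standalone sketch of that arithmetic, with a hypothetical `expected_attempts` helper:

```rust
use std::time::Duration;

// Hypothetical helper mirroring ExponentialBackoffRetryPolicy::next_delay and
// MaxTotalDelayRetryPolicy::next_delay; not part of the commit.
fn expected_attempts(base_delay: Duration, max_total_delay: Duration) -> u32 {
    let mut attempts_made = 1; // the first call always happens
    let mut accumulated_delay = Duration::ZERO;
    loop {
        // Same formula as ExponentialBackoffRetryPolicy.
        let next_delay = base_delay * (2_u32.pow(attempts_made) - 1);
        // Same cap check as MaxTotalDelayRetryPolicy.
        if max_total_delay < accumulated_delay + next_delay {
            return attempts_made;
        }
        accumulated_delay += next_delay;
        attempts_made += 1;
    }
}

fn main() {
    // Waits of 100 ms, then 300 ms: the second wait would push the total past
    // the 350 ms cap, so the operation is attempted exactly twice.
    assert_eq!(
        expected_attempts(Duration::from_millis(100), Duration::from_millis(350)),
        2
    );
}
```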
