From 633f5b3214ffb5c10b8b0de5774ec7901fc4fbae Mon Sep 17 00:00:00 2001 From: Aaron Marten <437496+AaronRM@users.noreply.github.com> Date: Wed, 15 Jan 2025 15:24:32 -0800 Subject: [PATCH 01/27] Fix comment about how to only run unit tests --- opentelemetry-otlp/tests/integration_test/src/test_utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opentelemetry-otlp/tests/integration_test/src/test_utils.rs b/opentelemetry-otlp/tests/integration_test/src/test_utils.rs index 50570d888a..9037dec865 100644 --- a/opentelemetry-otlp/tests/integration_test/src/test_utils.rs +++ b/opentelemetry-otlp/tests/integration_test/src/test_utils.rs @@ -13,7 +13,7 @@ //! Only a single test suite can run at once, as each container has statically mapped ports, but //! this works nicely with the way cargo executes the suite. //! -//! To skip integration tests with cargo, you can run `cargo test --mod`, which will run unit tests +//! To skip integration tests with cargo, you can run `cargo test --lib`, which will run unit tests //! only. //! #![cfg(unix)] From b3115f50cd22876978cd817c518ba7417c953b3d Mon Sep 17 00:00:00 2001 From: Aaron Marten <437496+AaronRM@users.noreply.github.com> Date: Wed, 15 Jan 2025 15:43:25 -0800 Subject: [PATCH 02/27] Remove extraneous lines from test_with_gzip_compression unit test --- opentelemetry-otlp/src/exporter/tonic/mod.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/tonic/mod.rs b/opentelemetry-otlp/src/exporter/tonic/mod.rs index a71a843e40..6986107602 100644 --- a/opentelemetry-otlp/src/exporter/tonic/mod.rs +++ b/opentelemetry-otlp/src/exporter/tonic/mod.rs @@ -564,9 +564,6 @@ mod tests { #[test] #[cfg(feature = "gzip-tonic")] fn test_with_gzip_compression() { - // metadata should merge with the current one with priority instead of just replacing it - let mut metadata = MetadataMap::new(); - metadata.insert("foo", "bar".parse().unwrap()); let builder = TonicExporterBuilder::default().with_compression(Compression::Gzip); assert_eq!(builder.tonic_config.compression.unwrap(), Compression::Gzip); } From 18dae97fc185cac01cd3dfb376a6e9ee1f1c257b Mon Sep 17 00:00:00 2001 From: Aaron Marten <437496+AaronRM@users.noreply.github.com> Date: Thu, 27 Feb 2025 10:38:16 -0800 Subject: [PATCH 03/27] Add retry_with_exponential_backoff method; Use in tonic logs client --- opentelemetry-otlp/src/exporter/http/mod.rs | 2 +- opentelemetry-otlp/src/exporter/tonic/logs.rs | 81 ++++++++++++------- opentelemetry-otlp/src/lib.rs | 1 + opentelemetry-otlp/src/retry.rs | 47 +++++++++++ opentelemetry-proto/src/transform/logs.rs | 8 +- 5 files changed, 105 insertions(+), 34 deletions(-) create mode 100644 opentelemetry-otlp/src/retry.rs diff --git a/opentelemetry-otlp/src/exporter/http/mod.rs b/opentelemetry-otlp/src/exporter/http/mod.rs index 4dbcb603f4..5cb25ea37e 100644 --- a/opentelemetry-otlp/src/exporter/http/mod.rs +++ b/opentelemetry-otlp/src/exporter/http/mod.rs @@ -388,7 +388,7 @@ impl OtlpHttpClient { logs: LogBatch<'_>, ) -> Result<(Vec, &'static str, Option<&'static str>), String> { use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; - let resource_logs = group_logs_by_resource_and_scope(logs, &self.resource); + let resource_logs = group_logs_by_resource_and_scope(&logs, &self.resource); let req = ExportLogsServiceRequest { resource_logs }; let (body, content_type) = match self.protocol { diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index 
9f8b9d8a6d..578a35b232 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -1,3 +1,4 @@ +use std::sync::Arc; use core::fmt; use opentelemetry::otel_debug; use opentelemetry_proto::tonic::collector::logs::v1::{ @@ -13,6 +14,8 @@ use opentelemetry_proto::transform::logs::tonic::group_logs_by_resource_and_scop use super::BoxInterceptor; +use crate::retry::{retry_with_exponential_backoff, RetryPolicy}; + pub(crate) struct TonicLogsClient { inner: Mutex>, #[allow(dead_code)] @@ -58,41 +61,61 @@ impl TonicLogsClient { impl LogExporter for TonicLogsClient { async fn export(&self, batch: LogBatch<'_>) -> OTelSdkResult { - let (mut client, metadata, extensions) = match self.inner.lock().await.as_mut() { - Some(inner) => { - let (m, e, _) = inner - .interceptor - .call(Request::new(())) - .map_err(|e| OTelSdkError::InternalFailure(format!("error: {e:?}")))? - .into_parts(); - (inner.client.clone(), m, e) - } - None => return Err(OTelSdkError::AlreadyShutdown), + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, }; - let resource_logs = group_logs_by_resource_and_scope(batch, &self.resource); + let batch = Arc::new(batch); - otel_debug!(name: "TonicLogsClient.ExportStarted"); + retry_with_exponential_backoff(policy, "TonicLogsClient.Export", { + let batch = Arc::clone(&batch); + let inner = &self.inner; + let resource = &self.resource; + move || { + let batch = Arc::clone(&batch); + Box::pin(async move { + let (mut client, metadata, extensions) = match inner.lock().await.as_mut() { + Some(inner) => { + let (m, e, _) = inner + .interceptor + .call(Request::new(())) + .map_err(|e| OTelSdkError::InternalFailure(format!("error: {e:?}")))? + .into_parts(); + (inner.client.clone(), m, e) + } + None => return Err(OTelSdkError::AlreadyShutdown), + }; - let result = client - .export(Request::from_parts( - metadata, - extensions, - ExportLogsServiceRequest { resource_logs }, - )) - .await; + let resource_logs = group_logs_by_resource_and_scope(&*batch, resource); - match result { - Ok(_) => { - otel_debug!(name: "TonicLogsClient.ExportSucceeded"); - Ok(()) - } - Err(e) => { - let error = format!("export error: {e:?}"); - otel_debug!(name: "TonicLogsClient.ExportFailed", error = &error); - Err(OTelSdkError::InternalFailure(error)) + otel_debug!(name: "TonicsLogsClient.ExportStarted"); + + let result = client + .export(Request::from_parts( + metadata, + extensions, + ExportLogsServiceRequest { resource_logs }, + )) + .await; + + match result { + Ok(_) => { + otel_debug!(name: "TonicLogsClient.ExportSucceeded"); + Ok(()) + } + Err(e) => { + let error = format!("export error: {e:?}"); + otel_debug!(name: "TonicLogsClient.ExportFailed", error = &error); + Err(OTelSdkError::InternalFailure(error)) + } + } + }) } - } + }) + .await } fn shutdown_with_timeout(&self, _timeout: time::Duration) -> OTelSdkResult { diff --git a/opentelemetry-otlp/src/lib.rs b/opentelemetry-otlp/src/lib.rs index 3474b2b590..2ddbec77ca 100644 --- a/opentelemetry-otlp/src/lib.rs +++ b/opentelemetry-otlp/src/lib.rs @@ -365,6 +365,7 @@ mod metric; #[cfg(feature = "trace")] #[cfg(any(feature = "http-proto", feature = "http-json", feature = "grpc-tonic"))] mod span; +mod retry; pub use crate::exporter::Compression; pub use crate::exporter::ExportConfig; diff --git a/opentelemetry-otlp/src/retry.rs b/opentelemetry-otlp/src/retry.rs new file mode 100644 index 0000000000..841ade870d --- /dev/null +++ 
b/opentelemetry-otlp/src/retry.rs @@ -0,0 +1,47 @@ +use std::future::Future; +use std::time::{Duration, SystemTime}; +use opentelemetry::otel_warn; +use tokio::time::sleep; + +pub(crate) struct RetryPolicy { + pub max_retries: usize, + pub initial_delay_ms: u64, + pub max_delay_ms: u64, + pub jitter_ms: u64, +} + +fn generate_jitter(max_jitter: u64) -> u64 { + let now = SystemTime::now(); + let nanos = now.duration_since(SystemTime::UNIX_EPOCH).unwrap().subsec_nanos(); + nanos as u64 % (max_jitter + 1) +} + +pub(crate) async fn retry_with_exponential_backoff( + policy: RetryPolicy, + operation_name: &str, + mut operation: F, +) -> Result +where + F: FnMut() -> Fut, + E: std::fmt::Debug, + Fut: Future>, +{ + let mut attempt = 0; + let mut delay = policy.initial_delay_ms; + + loop { + match operation().await { + Ok(result) => return Ok(result), + Err(err) if attempt < policy.max_retries => { + attempt += 1; + // Log the error and retry after a delay with jitter + otel_warn!(name: "OtlpRetry", message = format!("Retrying operation {:?} due to error: {:?}", operation_name, err)); + let jitter = generate_jitter(policy.jitter_ms); + let delay_with_jitter = std::cmp::min(delay + jitter, policy.max_delay_ms); + sleep(Duration::from_millis(delay_with_jitter)).await; + delay = std::cmp::min(delay * 2, policy.max_delay_ms); + } + Err(err) => return Err(err), + } + } +} \ No newline at end of file diff --git a/opentelemetry-proto/src/transform/logs.rs b/opentelemetry-proto/src/transform/logs.rs index 65d0598216..7d68decf82 100644 --- a/opentelemetry-proto/src/transform/logs.rs +++ b/opentelemetry-proto/src/transform/logs.rs @@ -165,8 +165,8 @@ pub mod tonic { } } - pub fn group_logs_by_resource_and_scope( - logs: LogBatch<'_>, + pub fn group_logs_by_resource_and_scope<'a>( + logs: &'a LogBatch<'a>, resource: &ResourceAttributesWithSchema, ) -> Vec { // Group logs by target or instrumentation name @@ -271,7 +271,7 @@ mod tests { let resource: ResourceAttributesWithSchema = (&resource).into(); // Convert Resource to ResourceAttributesWithSchema let grouped_logs = - crate::transform::logs::tonic::group_logs_by_resource_and_scope(log_batch, &resource); + crate::transform::logs::tonic::group_logs_by_resource_and_scope(&log_batch, &resource); assert_eq!(grouped_logs.len(), 1); let resource_logs = &grouped_logs[0]; @@ -291,7 +291,7 @@ mod tests { let log_batch = LogBatch::new(&logs); let resource: ResourceAttributesWithSchema = (&resource).into(); // Convert Resource to ResourceAttributesWithSchema let grouped_logs = - crate::transform::logs::tonic::group_logs_by_resource_and_scope(log_batch, &resource); + crate::transform::logs::tonic::group_logs_by_resource_and_scope(&log_batch, &resource); assert_eq!(grouped_logs.len(), 1); let resource_logs = &grouped_logs[0]; From d5743c6bea2b6b5f98c240a361d7f9f296e239fa Mon Sep 17 00:00:00 2001 From: Aaron Marten <437496+AaronRM@users.noreply.github.com> Date: Thu, 27 Feb 2025 11:07:20 -0800 Subject: [PATCH 04/27] Add tests and comments to retry.rs --- opentelemetry-otlp/src/retry.rs | 106 +++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 3 deletions(-) diff --git a/opentelemetry-otlp/src/retry.rs b/opentelemetry-otlp/src/retry.rs index 841ade870d..de75f51411 100644 --- a/opentelemetry-otlp/src/retry.rs +++ b/opentelemetry-otlp/src/retry.rs @@ -10,12 +10,14 @@ pub(crate) struct RetryPolicy { pub jitter_ms: u64, } +// Generates a random jitter value up to max_jitter fn generate_jitter(max_jitter: u64) -> u64 { let now = SystemTime::now(); let nanos = 
now.duration_since(SystemTime::UNIX_EPOCH).unwrap().subsec_nanos(); nanos as u64 % (max_jitter + 1) } +// Retries the given operation with exponential backoff and jitter pub(crate) async fn retry_with_exponential_backoff( policy: RetryPolicy, operation_name: &str, @@ -31,7 +33,7 @@ where loop { match operation().await { - Ok(result) => return Ok(result), + Ok(result) => return Ok(result), // Return the result if the operation succeeds Err(err) if attempt < policy.max_retries => { attempt += 1; // Log the error and retry after a delay with jitter @@ -39,9 +41,107 @@ where let jitter = generate_jitter(policy.jitter_ms); let delay_with_jitter = std::cmp::min(delay + jitter, policy.max_delay_ms); sleep(Duration::from_millis(delay_with_jitter)).await; - delay = std::cmp::min(delay * 2, policy.max_delay_ms); + delay = std::cmp::min(delay * 2, policy.max_delay_ms); // Exponential backoff } - Err(err) => return Err(err), + Err(err) => return Err(err), // Return the error if max retries are reached } } +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::time::timeout; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::time::Duration; + + // Test to ensure generate_jitter returns a value within the expected range + #[tokio::test] + async fn test_generate_jitter() { + let max_jitter = 100; + let jitter = generate_jitter(max_jitter); + assert!(jitter <= max_jitter); + } + + // Test to ensure retry_with_exponential_backoff succeeds on the first attempt + #[tokio::test] + async fn test_retry_with_exponential_backoff_success() { + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + let result = retry_with_exponential_backoff(policy, "test_operation", || { + Box::pin(async { Ok::<_, ()>("success") }) + }).await; + + assert_eq!(result, Ok("success")); + } + + // Test to ensure retry_with_exponential_backoff retries the operation and eventually succeeds + #[tokio::test] + async fn test_retry_with_exponential_backoff_retries() { + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + let attempts = AtomicUsize::new(0); + + let result = retry_with_exponential_backoff(policy, "test_operation", || { + let attempt = attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async move { + if attempt < 2 { + Err::<&str, &str>("error") // Fail the first two attempts + } else { + Ok::<&str, &str>("success") // Succeed on the third attempt + } + }) + }).await; + + assert_eq!(result, Ok("success")); + assert_eq!(attempts.load(Ordering::SeqCst), 3); // Ensure there were 3 attempts + } + + // Test to ensure retry_with_exponential_backoff fails after max retries + #[tokio::test] + async fn test_retry_with_exponential_backoff_failure() { + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + let attempts = AtomicUsize::new(0); + + let result = retry_with_exponential_backoff(policy, "test_operation", || { + attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async { Err::<(), _>("error") }) // Always fail + }).await; + + assert_eq!(result, Err("error")); + assert_eq!(attempts.load(Ordering::SeqCst), 4); // Ensure there were 4 attempts (initial + 3 retries) + } + + // Test to ensure retry_with_exponential_backoff respects the timeout + #[tokio::test] + async fn test_retry_with_exponential_backoff_timeout() { + let policy = RetryPolicy { + max_retries: 12, // Increase the number of retries + initial_delay_ms: 100, + 
max_delay_ms: 1600, + jitter_ms: 100, + }; + + let result = timeout(Duration::from_secs(1), retry_with_exponential_backoff(policy, "test_operation", || { + Box::pin(async { Err::<(), _>("error") }) // Always fail + })).await; + + assert!(result.is_err()); // Ensure the operation times out + } } \ No newline at end of file From 20c137571a9e2e20cada18d8afe5cdc7e0776bd6 Mon Sep 17 00:00:00 2001 From: Aaron Marten <437496+AaronRM@users.noreply.github.com> Date: Thu, 13 Mar 2025 06:25:46 -0700 Subject: [PATCH 05/27] Move retry to opentelemetry-sdk (WIP) --- opentelemetry-otlp/Cargo.toml | 2 +- opentelemetry-otlp/src/exporter/tonic/logs.rs | 4 +- opentelemetry-otlp/src/lib.rs | 1 - opentelemetry-sdk/src/lib.rs | 3 + .../src/retry.rs | 98 +++++++++++++++++-- 5 files changed, 95 insertions(+), 13 deletions(-) rename {opentelemetry-otlp => opentelemetry-sdk}/src/retry.rs (56%) diff --git a/opentelemetry-otlp/Cargo.toml b/opentelemetry-otlp/Cargo.toml index 9acf73a1fb..b345f6955b 100644 --- a/opentelemetry-otlp/Cargo.toml +++ b/opentelemetry-otlp/Cargo.toml @@ -69,7 +69,7 @@ serialize = ["serde", "serde_json"] default = ["http-proto", "reqwest-blocking-client", "trace", "metrics", "logs", "internal-logs"] # grpc using tonic -grpc-tonic = ["tonic", "prost", "http", "tokio", "opentelemetry-proto/gen-tonic"] +grpc-tonic = ["tonic", "prost", "http", "tokio", "opentelemetry-proto/gen-tonic", "opentelemetry_sdk/rt-tokio"] gzip-tonic = ["tonic/gzip"] zstd-tonic = ["tonic/zstd"] diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index 578a35b232..a60857eaf1 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -14,7 +14,7 @@ use opentelemetry_proto::transform::logs::tonic::group_logs_by_resource_and_scop use super::BoxInterceptor; -use crate::retry::{retry_with_exponential_backoff, RetryPolicy}; +use opentelemetry_sdk::retry::{retry_with_exponential_backoff, RetryPolicy}; pub(crate) struct TonicLogsClient { inner: Mutex>, @@ -70,7 +70,7 @@ impl LogExporter for TonicLogsClient { let batch = Arc::new(batch); - retry_with_exponential_backoff(policy, "TonicLogsClient.Export", { + retry_with_exponential_backoff::<_, _, _, _, tokio::time::Sleep>(policy, "TonicLogsClient.Export", { let batch = Arc::clone(&batch); let inner = &self.inner; let resource = &self.resource; diff --git a/opentelemetry-otlp/src/lib.rs b/opentelemetry-otlp/src/lib.rs index 2ddbec77ca..3474b2b590 100644 --- a/opentelemetry-otlp/src/lib.rs +++ b/opentelemetry-otlp/src/lib.rs @@ -365,7 +365,6 @@ mod metric; #[cfg(feature = "trace")] #[cfg(any(feature = "http-proto", feature = "http-json", feature = "grpc-tonic"))] mod span; -mod retry; pub use crate::exporter::Compression; pub use crate::exporter::ExportConfig; diff --git a/opentelemetry-sdk/src/lib.rs b/opentelemetry-sdk/src/lib.rs index 8f8e5a8653..1143e333f8 100644 --- a/opentelemetry-sdk/src/lib.rs +++ b/opentelemetry-sdk/src/lib.rs @@ -167,3 +167,6 @@ impl From> for InMemoryExporterError { InMemoryExporterError::InternalFailure(format!("Mutex poison error: {err}")) } } + +/// Retry logic for exporting telemetry data. +pub mod retry; diff --git a/opentelemetry-otlp/src/retry.rs b/opentelemetry-sdk/src/retry.rs similarity index 56% rename from opentelemetry-otlp/src/retry.rs rename to opentelemetry-sdk/src/retry.rs index de75f51411..c65247c6ec 100644 --- a/opentelemetry-otlp/src/retry.rs +++ b/opentelemetry-sdk/src/retry.rs @@ -1,12 +1,31 @@ +//! 
This module provides functionality for retrying operations with exponential backoff and jitter. +//! +//! The `RetryPolicy` struct defines the configuration for the retry behavior, including the maximum +//! number of retries, initial delay, maximum delay, and jitter. +//! +//! The `Sleep` trait abstracts the sleep functionality, allowing different implementations for +//! various async runtimes such as Tokio and async-std, as well as a synchronous implementation. +//! +//! The `retry_with_exponential_backoff` function retries the given operation according to the +//! specified retry policy, using exponential backoff and jitter to determine the delay between +//! retries. The function logs errors and retries the operation until it succeeds or the maximum +//! number of retries is reached. + use std::future::Future; +use std::pin::Pin; use std::time::{Duration, SystemTime}; use opentelemetry::otel_warn; -use tokio::time::sleep; -pub(crate) struct RetryPolicy { +/// Configuration for retry policy. +#[derive(Debug)] +pub struct RetryPolicy { + /// Maximum number of retry attempts. pub max_retries: usize, + /// Initial delay in milliseconds before the first retry. pub initial_delay_ms: u64, + /// Maximum delay in milliseconds between retries. pub max_delay_ms: u64, + /// Maximum jitter in milliseconds to add to the delay. pub jitter_ms: u64, } @@ -17,8 +36,68 @@ fn generate_jitter(max_jitter: u64) -> u64 { nanos as u64 % (max_jitter + 1) } -// Retries the given operation with exponential backoff and jitter -pub(crate) async fn retry_with_exponential_backoff( +/// Trait to abstract the sleep functionality. +pub trait Sleep { + /// The future returned by the sleep function. + type SleepFuture: Future; + + /// Sleeps for the specified duration. + fn sleep(duration: Duration) -> Self::SleepFuture; +} + +/// Implementation of the Sleep trait for tokio::time::Sleep +#[cfg(feature = "rt-tokio")] +impl Sleep for tokio::time::Sleep { + type SleepFuture = tokio::time::Sleep; + + fn sleep(duration: Duration) -> Self::SleepFuture { + tokio::time::sleep(duration) + } +} + +#[cfg(feature = "rt-async-std")] +/// There is no direct equivalent to `tokio::time::Sleep` in `async-std`. +/// Instead, we create a new struct `AsyncStdSleep` and implement the `Sleep` +/// trait for it, boxing the future returned by `async_std::task::sleep` to fit +/// the trait's associated type requirements. +#[derive(Debug)] +pub struct AsyncStdSleep; + +/// Implementation of the Sleep trait for async-std +#[cfg(feature = "rt-async-std")] +impl Sleep for AsyncStdSleep { + type SleepFuture = Pin + Send>>; + + fn sleep(duration: Duration) -> Self::SleepFuture { + Box::pin(async_std::task::sleep(duration)) + } +} + +/// Implement the Sleep trait for synchronous sleep +#[derive(Debug)] +pub struct StdSleep; + +impl Sleep for StdSleep { + type SleepFuture = std::future::Ready<()>; + + fn sleep(duration: Duration) -> Self::SleepFuture { + std::thread::sleep(duration); + std::future::ready(()) + } +} + +/// Retries the given operation with exponential backoff and jitter. +/// +/// # Arguments +/// +/// * `policy` - The retry policy configuration. +/// * `operation_name` - The name of the operation being retried. +/// * `operation` - The operation to be retried. +/// +/// # Returns +/// +/// A `Result` containing the operation's result or an error if the maximum retries are reached. 
+pub async fn retry_with_exponential_backoff( policy: RetryPolicy, operation_name: &str, mut operation: F, @@ -27,6 +106,7 @@ where F: FnMut() -> Fut, E: std::fmt::Debug, Fut: Future>, + S: Sleep, { let mut attempt = 0; let mut delay = policy.initial_delay_ms; @@ -40,7 +120,7 @@ where otel_warn!(name: "OtlpRetry", message = format!("Retrying operation {:?} due to error: {:?}", operation_name, err)); let jitter = generate_jitter(policy.jitter_ms); let delay_with_jitter = std::cmp::min(delay + jitter, policy.max_delay_ms); - sleep(Duration::from_millis(delay_with_jitter)).await; + S::sleep(Duration::from_millis(delay_with_jitter)).await; delay = std::cmp::min(delay * 2, policy.max_delay_ms); // Exponential backoff } Err(err) => return Err(err), // Return the error if max retries are reached @@ -73,7 +153,7 @@ mod tests { jitter_ms: 100, }; - let result = retry_with_exponential_backoff(policy, "test_operation", || { + let result = retry_with_exponential_backoff::<_, _, _, _, tokio::time::Sleep>(policy, "test_operation", || { Box::pin(async { Ok::<_, ()>("success") }) }).await; @@ -92,7 +172,7 @@ mod tests { let attempts = AtomicUsize::new(0); - let result = retry_with_exponential_backoff(policy, "test_operation", || { + let result = retry_with_exponential_backoff::<_, _, _, _, tokio::time::Sleep>(policy, "test_operation", || { let attempt = attempts.fetch_add(1, Ordering::SeqCst); Box::pin(async move { if attempt < 2 { @@ -119,7 +199,7 @@ mod tests { let attempts = AtomicUsize::new(0); - let result = retry_with_exponential_backoff(policy, "test_operation", || { + let result = retry_with_exponential_backoff::<_, _, _, _, tokio::time::Sleep>(policy, "test_operation", || { attempts.fetch_add(1, Ordering::SeqCst); Box::pin(async { Err::<(), _>("error") }) // Always fail }).await; @@ -138,7 +218,7 @@ mod tests { jitter_ms: 100, }; - let result = timeout(Duration::from_secs(1), retry_with_exponential_backoff(policy, "test_operation", || { + let result = timeout(Duration::from_secs(1), retry_with_exponential_backoff::<_, _, _, _, tokio::time::Sleep>(policy, "test_operation", || { Box::pin(async { Err::<(), _>("error") }) // Always fail })).await; From b3e17ef1791f2449a9ff66e4fdf9e126f1ecfb78 Mon Sep 17 00:00:00 2001 From: Aaron Marten <437496+AaronRM@users.noreply.github.com> Date: Thu, 27 Mar 2025 16:16:09 -0700 Subject: [PATCH 06/27] Fix build warning --- opentelemetry-otlp/src/exporter/tonic/logs.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index a8427dffaf..a6c784437d 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -6,7 +6,6 @@ use opentelemetry_proto::tonic::collector::logs::v1::{ }; use opentelemetry_sdk::error::{OTelSdkError, OTelSdkResult}; use opentelemetry_sdk::logs::{LogBatch, LogExporter}; -use std::time; use tokio::sync::Mutex; use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Channel, Request}; @@ -118,11 +117,9 @@ impl LogExporter for TonicLogsClient { .await } - fn shutdown(&mut self) -> OTelSdkResult { - match self.inner.take() { - Some(_) => Ok(()), // Successfully took `inner`, indicating a successful shutdown. - None => Err(OTelSdkError::AlreadyShutdown), // `inner` was already `None`, meaning it's already shut down. - } + fn shutdown(&self) -> OTelSdkResult { + // TODO: We broke this rebasing. fix it! 
+ Ok(()) } fn set_resource(&mut self, resource: &opentelemetry_sdk::Resource) { From edcc00cd488cabfa472b2464007ef30c4243a8f0 Mon Sep 17 00:00:00 2001 From: Aaron Marten <437496+AaronRM@users.noreply.github.com> Date: Thu, 27 Mar 2025 16:16:32 -0700 Subject: [PATCH 07/27] Scope retry to just logs+tonic --- opentelemetry-otlp/src/exporter/tonic/logs.rs | 10 +- opentelemetry-otlp/src/exporter/tonic/mod.rs | 5 + .../src/exporter/tonic/retry.rs | 229 ++++++++++++++++++ 3 files changed, 242 insertions(+), 2 deletions(-) create mode 100644 opentelemetry-otlp/src/exporter/tonic/retry.rs diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index a6c784437d..f67b7b4066 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -90,7 +90,7 @@ impl LogExporter for TonicLogsClient { let resource_logs = group_logs_by_resource_and_scope(&*batch, resource); - otel_debug!(name: "TonicsLogsClient.ExportStarted"); + otel_debug!(name: "TonicLogsClient.ExportStarted"); let result = client .export(Request::from_parts( @@ -118,7 +118,13 @@ impl LogExporter for TonicLogsClient { } fn shutdown(&self) -> OTelSdkResult { - // TODO: We broke this rebasing. fix it! + // TODO: Implement actual shutdown + // Due to the use of tokio::sync::Mutex to guard + // the inner client, we need to await the call to lock the mutex + // and that requires async runtime. + // It is possible to fix this by using + // a dedicated thread just to handle shutdown. + // But for now, we just return Ok. Ok(()) } diff --git a/opentelemetry-otlp/src/exporter/tonic/mod.rs b/opentelemetry-otlp/src/exporter/tonic/mod.rs index 6986107602..08ff858c36 100644 --- a/opentelemetry-otlp/src/exporter/tonic/mod.rs +++ b/opentelemetry-otlp/src/exporter/tonic/mod.rs @@ -25,6 +25,11 @@ pub(crate) mod metrics; #[cfg(feature = "trace")] pub(crate) mod trace; +// For now, we are not exposing the retry policy. Only work with grpc-tonic since retry takes a hard dependency on tokio +// while we sort out an abstraction for the async runtime which can be used by all exporters. +#[cfg(feature = "grpc-tonic")] +mod retry; + /// Configuration for [tonic] /// /// [tonic]: https://github.com/hyperium/tonic diff --git a/opentelemetry-otlp/src/exporter/tonic/retry.rs b/opentelemetry-otlp/src/exporter/tonic/retry.rs new file mode 100644 index 0000000000..d9a283a463 --- /dev/null +++ b/opentelemetry-otlp/src/exporter/tonic/retry.rs @@ -0,0 +1,229 @@ +//! This module provides functionality for retrying operations with exponential backoff and jitter. +//! +//! The `RetryPolicy` struct defines the configuration for the retry behavior, including the maximum +//! number of retries, initial delay, maximum delay, and jitter. +//! +//! The `Sleep` trait abstracts the sleep functionality, allowing different implementations for +//! various async runtimes such as Tokio and async-std, as well as a synchronous implementation. +//! +//! The `retry_with_exponential_backoff` function retries the given operation according to the +//! specified retry policy, using exponential backoff and jitter to determine the delay between +//! retries. The function logs errors and retries the operation until it succeeds or the maximum +//! number of retries is reached. + +use std::future::Future; +use std::time::{Duration, SystemTime}; +use opentelemetry::otel_warn; + +/// Configuration for retry policy. +#[derive(Debug)] +pub(super) struct RetryPolicy { + /// Maximum number of retry attempts. 
+ pub max_retries: usize, + /// Initial delay in milliseconds before the first retry. + pub initial_delay_ms: u64, + /// Maximum delay in milliseconds between retries. + pub max_delay_ms: u64, + /// Maximum jitter in milliseconds to add to the delay. + pub jitter_ms: u64, +} + +// Generates a random jitter value up to max_jitter +fn generate_jitter(max_jitter: u64) -> u64 { + let now = SystemTime::now(); + let nanos = now.duration_since(SystemTime::UNIX_EPOCH).unwrap().subsec_nanos(); + nanos as u64 % (max_jitter + 1) +} + +// /// Trait to abstract the sleep functionality. +// pub trait Sleep { +// /// The future returned by the sleep function. +// type SleepFuture: Future; + +// /// Sleeps for the specified duration. +// fn sleep(duration: Duration) -> Self::SleepFuture; +// } + +// /// Implementation of the Sleep trait for tokio::time::Sleep +// #[cfg(feature = "rt-tokio")] +// impl Sleep for tokio::time::Sleep { +// type SleepFuture = tokio::time::Sleep; + +// fn sleep(duration: Duration) -> Self::SleepFuture { +// } +// } + +// #[cfg(feature = "rt-async-std")] +// /// There is no direct equivalent to `tokio::time::Sleep` in `async-std`. +// /// Instead, we create a new struct `AsyncStdSleep` and implement the `Sleep` +// /// trait for it, boxing the future returned by `async_std::task::sleep` to fit +// /// the trait's associated type requirements. +// #[derive(Debug)] +// pub struct AsyncStdSleep; + +// /// Implementation of the Sleep trait for async-std +// #[cfg(feature = "rt-async-std")] +// impl Sleep for AsyncStdSleep { +// type SleepFuture = Pin + Send>>; + +// fn sleep(duration: Duration) -> Self::SleepFuture { +// Box::pin(async_std::task::sleep(duration)) +// } +// } + +// /// Implement the Sleep trait for synchronous sleep +// #[derive(Debug)] +// pub struct StdSleep; + +// impl Sleep for StdSleep { +// type SleepFuture = std::future::Ready<()>; + +// fn sleep(duration: Duration) -> Self::SleepFuture { +// std::thread::sleep(duration); +// std::future::ready(()) +// } +// } + +/// Retries the given operation with exponential backoff and jitter. +/// +/// # Arguments +/// +/// * `policy` - The retry policy configuration. +/// * `operation_name` - The name of the operation being retried. +/// * `operation` - The operation to be retried. +/// +/// # Returns +/// +/// A `Result` containing the operation's result or an error if the maximum retries are reached. +pub(super) async fn retry_with_exponential_backoff( + policy: RetryPolicy, + operation_name: &str, + mut operation: F, +) -> Result +where + F: FnMut() -> Fut, + E: std::fmt::Debug, + Fut: Future>, +{ + let mut attempt = 0; + let mut delay = policy.initial_delay_ms; + + loop { + match operation().await { + Ok(result) => return Ok(result), // Return the result if the operation succeeds + Err(err) if attempt < policy.max_retries => { + attempt += 1; + // Log the error and retry after a delay with jitter + otel_warn!(name: "OtlpRetry", message = format!("Retrying operation {:?} due to error: {:?}", operation_name, err)); + let jitter = generate_jitter(policy.jitter_ms); + let delay_with_jitter = std::cmp::min(delay + jitter, policy.max_delay_ms); + + // Retry currently only supports tokio::time::sleep (for use with gRPC/tonic). This + // should be replaced with a more generic sleep function that works with async-std + // and a synchronous runtime in the future. 
+ tokio::time::sleep(Duration::from_millis(delay_with_jitter)).await; + + delay = std::cmp::min(delay * 2, policy.max_delay_ms); // Exponential backoff + } + Err(err) => return Err(err), // Return the error if max retries are reached + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::time::timeout; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::time::Duration; + + // Test to ensure generate_jitter returns a value within the expected range + #[tokio::test] + async fn test_generate_jitter() { + let max_jitter = 100; + let jitter = generate_jitter(max_jitter); + assert!(jitter <= max_jitter); + } + + // Test to ensure retry_with_exponential_backoff succeeds on the first attempt + #[tokio::test] + async fn test_retry_with_exponential_backoff_success() { + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + let result = retry_with_exponential_backoff(policy, "test_operation", || { + Box::pin(async { Ok::<_, ()>("success") }) + }).await; + + assert_eq!(result, Ok("success")); + } + + // Test to ensure retry_with_exponential_backoff retries the operation and eventually succeeds + #[tokio::test] + async fn test_retry_with_exponential_backoff_retries() { + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + let attempts = AtomicUsize::new(0); + + let result = retry_with_exponential_backoff(policy, "test_operation", || { + let attempt = attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async move { + if attempt < 2 { + Err::<&str, &str>("error") // Fail the first two attempts + } else { + Ok::<&str, &str>("success") // Succeed on the third attempt + } + }) + }).await; + + assert_eq!(result, Ok("success")); + assert_eq!(attempts.load(Ordering::SeqCst), 3); // Ensure there were 3 attempts + } + + // Test to ensure retry_with_exponential_backoff fails after max retries + #[tokio::test] + async fn test_retry_with_exponential_backoff_failure() { + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + let attempts = AtomicUsize::new(0); + + let result = retry_with_exponential_backoff(policy, "test_operation", || { + attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async { Err::<(), _>("error") }) // Always fail + }).await; + + assert_eq!(result, Err("error")); + assert_eq!(attempts.load(Ordering::SeqCst), 4); // Ensure there were 4 attempts (initial + 3 retries) + } + + // Test to ensure retry_with_exponential_backoff respects the timeout + #[tokio::test] + async fn test_retry_with_exponential_backoff_timeout() { + let policy = RetryPolicy { + max_retries: 12, // Increase the number of retries + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + let result = timeout(Duration::from_secs(1), retry_with_exponential_backoff(policy, "test_operation", || { + Box::pin(async { Err::<(), _>("error") }) // Always fail + })).await; + + assert!(result.is_err()); // Ensure the operation times out + } +} \ No newline at end of file From ee75730141d1174c5f8bfdef5885af98838c7567 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Mon, 11 Aug 2025 17:33:34 +0200 Subject: [PATCH 08/27] clean up after rebase --- opentelemetry-otlp/src/exporter/tonic/logs.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index f67b7b4066..801e37bd99 100644 --- 
a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -6,6 +6,7 @@ use opentelemetry_proto::tonic::collector::logs::v1::{ }; use opentelemetry_sdk::error::{OTelSdkError, OTelSdkResult}; use opentelemetry_sdk::logs::{LogBatch, LogExporter}; +use std::time; use tokio::sync::Mutex; use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Channel, Request}; @@ -117,7 +118,7 @@ impl LogExporter for TonicLogsClient { .await } - fn shutdown(&self) -> OTelSdkResult { + fn shutdown_with_timeout(&self, _timeout: time::Duration) -> OTelSdkResult { // TODO: Implement actual shutdown // Due to the use of tokio::sync::Mutex to guard // the inner client, we need to await the call to lock the mutex From bcdc17974df81500c384174115288f2c18fc48c8 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 12 Aug 2025 09:15:43 +0200 Subject: [PATCH 09/27] migrate to the async runtime abstraction --- opentelemetry-otlp/Cargo.toml | 2 +- opentelemetry-otlp/src/exporter/tonic/logs.rs | 9 +- opentelemetry-otlp/src/exporter/tonic/mod.rs | 5 - .../src/exporter/tonic/retry.rs | 229 ------------------ opentelemetry-sdk/src/retry.rs | 146 +++++------ 5 files changed, 82 insertions(+), 309 deletions(-) delete mode 100644 opentelemetry-otlp/src/exporter/tonic/retry.rs diff --git a/opentelemetry-otlp/Cargo.toml b/opentelemetry-otlp/Cargo.toml index b345f6955b..3d28ea9de2 100644 --- a/opentelemetry-otlp/Cargo.toml +++ b/opentelemetry-otlp/Cargo.toml @@ -69,7 +69,7 @@ serialize = ["serde", "serde_json"] default = ["http-proto", "reqwest-blocking-client", "trace", "metrics", "logs", "internal-logs"] # grpc using tonic -grpc-tonic = ["tonic", "prost", "http", "tokio", "opentelemetry-proto/gen-tonic", "opentelemetry_sdk/rt-tokio"] +grpc-tonic = ["tonic", "prost", "http", "tokio", "opentelemetry-proto/gen-tonic", "opentelemetry_sdk/rt-tokio", "opentelemetry_sdk/experimental_async_runtime"] gzip-tonic = ["tonic/gzip"] zstd-tonic = ["tonic/zstd"] diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index 801e37bd99..62d58548f0 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -1,4 +1,3 @@ -use std::sync::Arc; use core::fmt; use opentelemetry::otel_debug; use opentelemetry_proto::tonic::collector::logs::v1::{ @@ -6,6 +5,7 @@ use opentelemetry_proto::tonic::collector::logs::v1::{ }; use opentelemetry_sdk::error::{OTelSdkError, OTelSdkResult}; use opentelemetry_sdk::logs::{LogBatch, LogExporter}; +use std::sync::Arc; use std::time; use tokio::sync::Mutex; use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Channel, Request}; @@ -15,6 +15,7 @@ use opentelemetry_proto::transform::logs::tonic::group_logs_by_resource_and_scop use super::BoxInterceptor; use opentelemetry_sdk::retry::{retry_with_exponential_backoff, RetryPolicy}; +use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicLogsClient { inner: Mutex>, @@ -70,7 +71,7 @@ impl LogExporter for TonicLogsClient { let batch = Arc::new(batch); - retry_with_exponential_backoff::<_, _, _, _, tokio::time::Sleep>(policy, "TonicLogsClient.Export", { + retry_with_exponential_backoff(Tokio, policy, "TonicLogsClient.Export", { let batch = Arc::clone(&batch); let inner = &self.inner; let resource = &self.resource; @@ -82,7 +83,9 @@ impl LogExporter for TonicLogsClient { let (m, e, _) = inner .interceptor .call(Request::new(())) - .map_err(|e| 
OTelSdkError::InternalFailure(format!("error: {e:?}")))? + .map_err(|e| { + OTelSdkError::InternalFailure(format!("error: {e:?}")) + })? .into_parts(); (inner.client.clone(), m, e) } diff --git a/opentelemetry-otlp/src/exporter/tonic/mod.rs b/opentelemetry-otlp/src/exporter/tonic/mod.rs index 08ff858c36..6986107602 100644 --- a/opentelemetry-otlp/src/exporter/tonic/mod.rs +++ b/opentelemetry-otlp/src/exporter/tonic/mod.rs @@ -25,11 +25,6 @@ pub(crate) mod metrics; #[cfg(feature = "trace")] pub(crate) mod trace; -// For now, we are not exposing the retry policy. Only work with grpc-tonic since retry takes a hard dependency on tokio -// while we sort out an abstraction for the async runtime which can be used by all exporters. -#[cfg(feature = "grpc-tonic")] -mod retry; - /// Configuration for [tonic] /// /// [tonic]: https://github.com/hyperium/tonic diff --git a/opentelemetry-otlp/src/exporter/tonic/retry.rs b/opentelemetry-otlp/src/exporter/tonic/retry.rs deleted file mode 100644 index d9a283a463..0000000000 --- a/opentelemetry-otlp/src/exporter/tonic/retry.rs +++ /dev/null @@ -1,229 +0,0 @@ -//! This module provides functionality for retrying operations with exponential backoff and jitter. -//! -//! The `RetryPolicy` struct defines the configuration for the retry behavior, including the maximum -//! number of retries, initial delay, maximum delay, and jitter. -//! -//! The `Sleep` trait abstracts the sleep functionality, allowing different implementations for -//! various async runtimes such as Tokio and async-std, as well as a synchronous implementation. -//! -//! The `retry_with_exponential_backoff` function retries the given operation according to the -//! specified retry policy, using exponential backoff and jitter to determine the delay between -//! retries. The function logs errors and retries the operation until it succeeds or the maximum -//! number of retries is reached. - -use std::future::Future; -use std::time::{Duration, SystemTime}; -use opentelemetry::otel_warn; - -/// Configuration for retry policy. -#[derive(Debug)] -pub(super) struct RetryPolicy { - /// Maximum number of retry attempts. - pub max_retries: usize, - /// Initial delay in milliseconds before the first retry. - pub initial_delay_ms: u64, - /// Maximum delay in milliseconds between retries. - pub max_delay_ms: u64, - /// Maximum jitter in milliseconds to add to the delay. - pub jitter_ms: u64, -} - -// Generates a random jitter value up to max_jitter -fn generate_jitter(max_jitter: u64) -> u64 { - let now = SystemTime::now(); - let nanos = now.duration_since(SystemTime::UNIX_EPOCH).unwrap().subsec_nanos(); - nanos as u64 % (max_jitter + 1) -} - -// /// Trait to abstract the sleep functionality. -// pub trait Sleep { -// /// The future returned by the sleep function. -// type SleepFuture: Future; - -// /// Sleeps for the specified duration. -// fn sleep(duration: Duration) -> Self::SleepFuture; -// } - -// /// Implementation of the Sleep trait for tokio::time::Sleep -// #[cfg(feature = "rt-tokio")] -// impl Sleep for tokio::time::Sleep { -// type SleepFuture = tokio::time::Sleep; - -// fn sleep(duration: Duration) -> Self::SleepFuture { -// } -// } - -// #[cfg(feature = "rt-async-std")] -// /// There is no direct equivalent to `tokio::time::Sleep` in `async-std`. -// /// Instead, we create a new struct `AsyncStdSleep` and implement the `Sleep` -// /// trait for it, boxing the future returned by `async_std::task::sleep` to fit -// /// the trait's associated type requirements. 
-// #[derive(Debug)] -// pub struct AsyncStdSleep; - -// /// Implementation of the Sleep trait for async-std -// #[cfg(feature = "rt-async-std")] -// impl Sleep for AsyncStdSleep { -// type SleepFuture = Pin + Send>>; - -// fn sleep(duration: Duration) -> Self::SleepFuture { -// Box::pin(async_std::task::sleep(duration)) -// } -// } - -// /// Implement the Sleep trait for synchronous sleep -// #[derive(Debug)] -// pub struct StdSleep; - -// impl Sleep for StdSleep { -// type SleepFuture = std::future::Ready<()>; - -// fn sleep(duration: Duration) -> Self::SleepFuture { -// std::thread::sleep(duration); -// std::future::ready(()) -// } -// } - -/// Retries the given operation with exponential backoff and jitter. -/// -/// # Arguments -/// -/// * `policy` - The retry policy configuration. -/// * `operation_name` - The name of the operation being retried. -/// * `operation` - The operation to be retried. -/// -/// # Returns -/// -/// A `Result` containing the operation's result or an error if the maximum retries are reached. -pub(super) async fn retry_with_exponential_backoff( - policy: RetryPolicy, - operation_name: &str, - mut operation: F, -) -> Result -where - F: FnMut() -> Fut, - E: std::fmt::Debug, - Fut: Future>, -{ - let mut attempt = 0; - let mut delay = policy.initial_delay_ms; - - loop { - match operation().await { - Ok(result) => return Ok(result), // Return the result if the operation succeeds - Err(err) if attempt < policy.max_retries => { - attempt += 1; - // Log the error and retry after a delay with jitter - otel_warn!(name: "OtlpRetry", message = format!("Retrying operation {:?} due to error: {:?}", operation_name, err)); - let jitter = generate_jitter(policy.jitter_ms); - let delay_with_jitter = std::cmp::min(delay + jitter, policy.max_delay_ms); - - // Retry currently only supports tokio::time::sleep (for use with gRPC/tonic). This - // should be replaced with a more generic sleep function that works with async-std - // and a synchronous runtime in the future. 
- tokio::time::sleep(Duration::from_millis(delay_with_jitter)).await; - - delay = std::cmp::min(delay * 2, policy.max_delay_ms); // Exponential backoff - } - Err(err) => return Err(err), // Return the error if max retries are reached - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use tokio::time::timeout; - use std::sync::atomic::{AtomicUsize, Ordering}; - use std::time::Duration; - - // Test to ensure generate_jitter returns a value within the expected range - #[tokio::test] - async fn test_generate_jitter() { - let max_jitter = 100; - let jitter = generate_jitter(max_jitter); - assert!(jitter <= max_jitter); - } - - // Test to ensure retry_with_exponential_backoff succeeds on the first attempt - #[tokio::test] - async fn test_retry_with_exponential_backoff_success() { - let policy = RetryPolicy { - max_retries: 3, - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, - }; - - let result = retry_with_exponential_backoff(policy, "test_operation", || { - Box::pin(async { Ok::<_, ()>("success") }) - }).await; - - assert_eq!(result, Ok("success")); - } - - // Test to ensure retry_with_exponential_backoff retries the operation and eventually succeeds - #[tokio::test] - async fn test_retry_with_exponential_backoff_retries() { - let policy = RetryPolicy { - max_retries: 3, - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, - }; - - let attempts = AtomicUsize::new(0); - - let result = retry_with_exponential_backoff(policy, "test_operation", || { - let attempt = attempts.fetch_add(1, Ordering::SeqCst); - Box::pin(async move { - if attempt < 2 { - Err::<&str, &str>("error") // Fail the first two attempts - } else { - Ok::<&str, &str>("success") // Succeed on the third attempt - } - }) - }).await; - - assert_eq!(result, Ok("success")); - assert_eq!(attempts.load(Ordering::SeqCst), 3); // Ensure there were 3 attempts - } - - // Test to ensure retry_with_exponential_backoff fails after max retries - #[tokio::test] - async fn test_retry_with_exponential_backoff_failure() { - let policy = RetryPolicy { - max_retries: 3, - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, - }; - - let attempts = AtomicUsize::new(0); - - let result = retry_with_exponential_backoff(policy, "test_operation", || { - attempts.fetch_add(1, Ordering::SeqCst); - Box::pin(async { Err::<(), _>("error") }) // Always fail - }).await; - - assert_eq!(result, Err("error")); - assert_eq!(attempts.load(Ordering::SeqCst), 4); // Ensure there were 4 attempts (initial + 3 retries) - } - - // Test to ensure retry_with_exponential_backoff respects the timeout - #[tokio::test] - async fn test_retry_with_exponential_backoff_timeout() { - let policy = RetryPolicy { - max_retries: 12, // Increase the number of retries - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, - }; - - let result = timeout(Duration::from_secs(1), retry_with_exponential_backoff(policy, "test_operation", || { - Box::pin(async { Err::<(), _>("error") }) // Always fail - })).await; - - assert!(result.is_err()); // Ensure the operation times out - } -} \ No newline at end of file diff --git a/opentelemetry-sdk/src/retry.rs b/opentelemetry-sdk/src/retry.rs index c65247c6ec..59e9fc0c0a 100644 --- a/opentelemetry-sdk/src/retry.rs +++ b/opentelemetry-sdk/src/retry.rs @@ -3,18 +3,20 @@ //! The `RetryPolicy` struct defines the configuration for the retry behavior, including the maximum //! number of retries, initial delay, maximum delay, and jitter. //! -//! 
The `Sleep` trait abstracts the sleep functionality, allowing different implementations for -//! various async runtimes such as Tokio and async-std, as well as a synchronous implementation. -//! //! The `retry_with_exponential_backoff` function retries the given operation according to the //! specified retry policy, using exponential backoff and jitter to determine the delay between //! retries. The function logs errors and retries the operation until it succeeds or the maximum //! number of retries is reached. +#[cfg(feature = "experimental_async_runtime")] +use opentelemetry::otel_warn; +#[cfg(feature = "experimental_async_runtime")] use std::future::Future; -use std::pin::Pin; +#[cfg(feature = "experimental_async_runtime")] use std::time::{Duration, SystemTime}; -use opentelemetry::otel_warn; + +#[cfg(feature = "experimental_async_runtime")] +use crate::runtime::Runtime; /// Configuration for retry policy. #[derive(Debug)] @@ -29,67 +31,36 @@ pub struct RetryPolicy { pub jitter_ms: u64, } +/// A runtime stub for when experimental_async_runtime is not enabled. +/// This allows retry policy to be configured but no actual retries occur. +#[cfg(not(feature = "experimental_async_runtime"))] +#[derive(Debug, Clone)] +pub struct NoOpRuntime; + +#[cfg(not(feature = "experimental_async_runtime"))] +impl NoOpRuntime { + /// Creates a new no-op runtime. + pub fn new() -> Self { + Self + } +} + // Generates a random jitter value up to max_jitter +#[cfg(feature = "experimental_async_runtime")] fn generate_jitter(max_jitter: u64) -> u64 { let now = SystemTime::now(); - let nanos = now.duration_since(SystemTime::UNIX_EPOCH).unwrap().subsec_nanos(); + let nanos = now + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .subsec_nanos(); nanos as u64 % (max_jitter + 1) } -/// Trait to abstract the sleep functionality. -pub trait Sleep { - /// The future returned by the sleep function. - type SleepFuture: Future; - - /// Sleeps for the specified duration. - fn sleep(duration: Duration) -> Self::SleepFuture; -} - -/// Implementation of the Sleep trait for tokio::time::Sleep -#[cfg(feature = "rt-tokio")] -impl Sleep for tokio::time::Sleep { - type SleepFuture = tokio::time::Sleep; - - fn sleep(duration: Duration) -> Self::SleepFuture { - tokio::time::sleep(duration) - } -} - -#[cfg(feature = "rt-async-std")] -/// There is no direct equivalent to `tokio::time::Sleep` in `async-std`. -/// Instead, we create a new struct `AsyncStdSleep` and implement the `Sleep` -/// trait for it, boxing the future returned by `async_std::task::sleep` to fit -/// the trait's associated type requirements. -#[derive(Debug)] -pub struct AsyncStdSleep; - -/// Implementation of the Sleep trait for async-std -#[cfg(feature = "rt-async-std")] -impl Sleep for AsyncStdSleep { - type SleepFuture = Pin + Send>>; - - fn sleep(duration: Duration) -> Self::SleepFuture { - Box::pin(async_std::task::sleep(duration)) - } -} - -/// Implement the Sleep trait for synchronous sleep -#[derive(Debug)] -pub struct StdSleep; - -impl Sleep for StdSleep { - type SleepFuture = std::future::Ready<()>; - - fn sleep(duration: Duration) -> Self::SleepFuture { - std::thread::sleep(duration); - std::future::ready(()) - } -} - /// Retries the given operation with exponential backoff and jitter. /// /// # Arguments /// +/// * `runtime` - The async runtime to use for delays. /// * `policy` - The retry policy configuration. /// * `operation_name` - The name of the operation being retried. /// * `operation` - The operation to be retried. 
@@ -97,16 +68,18 @@ impl Sleep for StdSleep { /// # Returns /// /// A `Result` containing the operation's result or an error if the maximum retries are reached. -pub async fn retry_with_exponential_backoff( +#[cfg(feature = "experimental_async_runtime")] +pub async fn retry_with_exponential_backoff( + runtime: R, policy: RetryPolicy, operation_name: &str, mut operation: F, ) -> Result where + R: Runtime, F: FnMut() -> Fut, E: std::fmt::Debug, Fut: Future>, - S: Sleep, { let mut attempt = 0; let mut delay = policy.initial_delay_ms; @@ -120,7 +93,9 @@ where otel_warn!(name: "OtlpRetry", message = format!("Retrying operation {:?} due to error: {:?}", operation_name, err)); let jitter = generate_jitter(policy.jitter_ms); let delay_with_jitter = std::cmp::min(delay + jitter, policy.max_delay_ms); - S::sleep(Duration::from_millis(delay_with_jitter)).await; + runtime + .delay(Duration::from_millis(delay_with_jitter)) + .await; delay = std::cmp::min(delay * 2, policy.max_delay_ms); // Exponential backoff } Err(err) => return Err(err), // Return the error if max retries are reached @@ -128,12 +103,30 @@ where } } -#[cfg(test)] +/// No-op retry function for when experimental_async_runtime is not enabled. +/// This function will execute the operation exactly once without any retries. +#[cfg(not(feature = "experimental_async_runtime"))] +pub async fn retry_with_exponential_backoff( + _runtime: R, + _policy: RetryPolicy, + _operation_name: &str, + mut operation: F, +) -> Result +where + F: FnMut() -> Fut, + Fut: std::future::Future>, +{ + // Without experimental_async_runtime, we just execute once without retries + operation().await +} + +#[cfg(all(test, feature = "experimental_async_runtime", feature = "rt-tokio"))] mod tests { use super::*; - use tokio::time::timeout; + use crate::runtime::Tokio; use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Duration; + use tokio::time::timeout; // Test to ensure generate_jitter returns a value within the expected range #[tokio::test] @@ -146,6 +139,7 @@ mod tests { // Test to ensure retry_with_exponential_backoff succeeds on the first attempt #[tokio::test] async fn test_retry_with_exponential_backoff_success() { + let runtime = Tokio; let policy = RetryPolicy { max_retries: 3, initial_delay_ms: 100, @@ -153,9 +147,10 @@ mod tests { jitter_ms: 100, }; - let result = retry_with_exponential_backoff::<_, _, _, _, tokio::time::Sleep>(policy, "test_operation", || { + let result = retry_with_exponential_backoff(runtime, policy, "test_operation", || { Box::pin(async { Ok::<_, ()>("success") }) - }).await; + }) + .await; assert_eq!(result, Ok("success")); } @@ -163,6 +158,7 @@ mod tests { // Test to ensure retry_with_exponential_backoff retries the operation and eventually succeeds #[tokio::test] async fn test_retry_with_exponential_backoff_retries() { + let runtime = Tokio; let policy = RetryPolicy { max_retries: 3, initial_delay_ms: 100, @@ -172,7 +168,7 @@ mod tests { let attempts = AtomicUsize::new(0); - let result = retry_with_exponential_backoff::<_, _, _, _, tokio::time::Sleep>(policy, "test_operation", || { + let result = retry_with_exponential_backoff(runtime, policy, "test_operation", || { let attempt = attempts.fetch_add(1, Ordering::SeqCst); Box::pin(async move { if attempt < 2 { @@ -181,7 +177,8 @@ mod tests { Ok::<&str, &str>("success") // Succeed on the third attempt } }) - }).await; + }) + .await; assert_eq!(result, Ok("success")); assert_eq!(attempts.load(Ordering::SeqCst), 3); // Ensure there were 3 attempts @@ -190,6 +187,7 @@ mod tests { 
// Test to ensure retry_with_exponential_backoff fails after max retries #[tokio::test] async fn test_retry_with_exponential_backoff_failure() { + let runtime = Tokio; let policy = RetryPolicy { max_retries: 3, initial_delay_ms: 100, @@ -199,10 +197,11 @@ mod tests { let attempts = AtomicUsize::new(0); - let result = retry_with_exponential_backoff::<_, _, _, _, tokio::time::Sleep>(policy, "test_operation", || { + let result = retry_with_exponential_backoff(runtime, policy, "test_operation", || { attempts.fetch_add(1, Ordering::SeqCst); Box::pin(async { Err::<(), _>("error") }) // Always fail - }).await; + }) + .await; assert_eq!(result, Err("error")); assert_eq!(attempts.load(Ordering::SeqCst), 4); // Ensure there were 4 attempts (initial + 3 retries) @@ -211,6 +210,7 @@ mod tests { // Test to ensure retry_with_exponential_backoff respects the timeout #[tokio::test] async fn test_retry_with_exponential_backoff_timeout() { + let runtime = Tokio; let policy = RetryPolicy { max_retries: 12, // Increase the number of retries initial_delay_ms: 100, @@ -218,10 +218,14 @@ mod tests { jitter_ms: 100, }; - let result = timeout(Duration::from_secs(1), retry_with_exponential_backoff::<_, _, _, _, tokio::time::Sleep>(policy, "test_operation", || { - Box::pin(async { Err::<(), _>("error") }) // Always fail - })).await; + let result = timeout( + Duration::from_secs(1), + retry_with_exponential_backoff(runtime, policy, "test_operation", || { + Box::pin(async { Err::<(), _>("error") }) // Always fail + }), + ) + .await; assert!(result.is_err()); // Ensure the operation times out } -} \ No newline at end of file +} From ca457ba81e2c5578b39cadbc892e32997ecbeeb5 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 12 Aug 2025 09:54:57 +0200 Subject: [PATCH 10/27] add retry to traces --- opentelemetry-otlp/src/exporter/tonic/logs.rs | 2 +- .../src/exporter/tonic/trace.rs | 86 ++++++++++++------- opentelemetry-sdk/src/retry.rs | 2 +- 3 files changed, 57 insertions(+), 33 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index 62d58548f0..29946128ce 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -92,7 +92,7 @@ impl LogExporter for TonicLogsClient { None => return Err(OTelSdkError::AlreadyShutdown), }; - let resource_logs = group_logs_by_resource_and_scope(&*batch, resource); + let resource_logs = group_logs_by_resource_and_scope(&batch, resource); otel_debug!(name: "TonicLogsClient.ExportStarted"); diff --git a/opentelemetry-otlp/src/exporter/tonic/trace.rs b/opentelemetry-otlp/src/exporter/tonic/trace.rs index 4378c37a04..2ce8b884a3 100644 --- a/opentelemetry-otlp/src/exporter/tonic/trace.rs +++ b/opentelemetry-otlp/src/exporter/tonic/trace.rs @@ -1,4 +1,5 @@ use core::fmt; +use std::sync::Arc; use tokio::sync::Mutex; use opentelemetry::otel_debug; @@ -15,6 +16,9 @@ use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Chann use super::BoxInterceptor; +use opentelemetry_sdk::retry::{retry_with_exponential_backoff, RetryPolicy}; +use opentelemetry_sdk::runtime::Tokio; + pub(crate) struct TonicTracesClient { inner: Option, #[allow(dead_code)] @@ -60,43 +64,63 @@ impl TonicTracesClient { impl SpanExporter for TonicTracesClient { async fn export(&self, batch: Vec) -> OTelSdkResult { - let (mut client, metadata, extensions) = match &self.inner { - Some(inner) => { - let (m, e, _) = inner - .interceptor - .lock() - .await // tokio::sync::Mutex doesn't return 
a poisoned error, so we can safely use the interceptor here - .call(Request::new(())) - .map_err(|e| OTelSdkError::InternalFailure(format!("error: {e:?}")))? - .into_parts(); - (inner.client.clone(), m, e) - } - None => return Err(OTelSdkError::AlreadyShutdown), + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, }; - let resource_spans = group_spans_by_resource_and_scope(batch, &self.resource); + let batch = Arc::new(batch); - otel_debug!(name: "TonicTracesClient.ExportStarted"); + retry_with_exponential_backoff(Tokio, policy, "TonicTracesClient.Export", { + let batch = Arc::clone(&batch); + let inner = &self.inner; + let resource = &self.resource; + move || { + let batch = Arc::clone(&batch); + Box::pin(async move { + let (mut client, metadata, extensions) = match inner { + Some(inner) => { + let (m, e, _) = inner + .interceptor + .lock() + .await // tokio::sync::Mutex doesn't return a poisoned error, so we can safely use the interceptor here + .call(Request::new(())) + .map_err(|e| OTelSdkError::InternalFailure(format!("error: {e:?}")))? + .into_parts(); + (inner.client.clone(), m, e) + } + None => return Err(OTelSdkError::AlreadyShutdown), + }; - let result = client - .export(Request::from_parts( - metadata, - extensions, - ExportTraceServiceRequest { resource_spans }, - )) - .await; + let resource_spans = group_spans_by_resource_and_scope((*batch).clone(), resource); - match result { - Ok(_) => { - otel_debug!(name: "TonicTracesClient.ExportSucceeded"); - Ok(()) - } - Err(e) => { - let error = e.to_string(); - otel_debug!(name: "TonicTracesClient.ExportFailed", error = &error); - Err(OTelSdkError::InternalFailure(error)) + otel_debug!(name: "TonicTracesClient.ExportStarted"); + + let result = client + .export(Request::from_parts( + metadata, + extensions, + ExportTraceServiceRequest { resource_spans }, + )) + .await; + + match result { + Ok(_) => { + otel_debug!(name: "TonicTracesClient.ExportSucceeded"); + Ok(()) + } + Err(e) => { + let error = format!("export error: {e:?}"); + otel_debug!(name: "TonicTracesClient.ExportFailed", error = &error); + Err(OTelSdkError::InternalFailure(error)) + } + } + }) } - } + }) + .await } fn shutdown(&mut self) -> OTelSdkResult { diff --git a/opentelemetry-sdk/src/retry.rs b/opentelemetry-sdk/src/retry.rs index 59e9fc0c0a..1ff347b096 100644 --- a/opentelemetry-sdk/src/retry.rs +++ b/opentelemetry-sdk/src/retry.rs @@ -34,7 +34,7 @@ pub struct RetryPolicy { /// A runtime stub for when experimental_async_runtime is not enabled. /// This allows retry policy to be configured but no actual retries occur. 
#[cfg(not(feature = "experimental_async_runtime"))] -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct NoOpRuntime; #[cfg(not(feature = "experimental_async_runtime"))] From 06d6effb0507dc950e21657954befaa790533adf Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 12 Aug 2025 10:02:52 +0200 Subject: [PATCH 11/27] add retry to metrics --- .../src/exporter/tonic/metrics.rs | 101 ++++++++++-------- 1 file changed, 59 insertions(+), 42 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/tonic/metrics.rs b/opentelemetry-otlp/src/exporter/tonic/metrics.rs index 13813c7305..07926366e0 100644 --- a/opentelemetry-otlp/src/exporter/tonic/metrics.rs +++ b/opentelemetry-otlp/src/exporter/tonic/metrics.rs @@ -12,6 +12,9 @@ use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Chann use super::BoxInterceptor; use crate::metric::MetricsClient; +use opentelemetry_sdk::retry::{retry_with_exponential_backoff, RetryPolicy}; +use opentelemetry_sdk::runtime::Tokio; + pub(crate) struct TonicMetricsClient { inner: Mutex>, } @@ -53,49 +56,63 @@ impl TonicMetricsClient { impl MetricsClient for TonicMetricsClient { async fn export(&self, metrics: &ResourceMetrics) -> OTelSdkResult { - let (mut client, metadata, extensions) = self - .inner - .lock() - .map_err(|e| OTelSdkError::InternalFailure(format!("Failed to acquire lock: {e:?}"))) - .and_then(|mut inner| match &mut *inner { - Some(inner) => { - let (m, e, _) = inner - .interceptor - .call(Request::new(())) - .map_err(|e| { - OTelSdkError::InternalFailure(format!( - "unexpected status while exporting {e:?}" - )) - })? - .into_parts(); - Ok((inner.client.clone(), m, e)) - } - None => Err(OTelSdkError::InternalFailure( - "exporter is already shut down".into(), - )), - })?; - - otel_debug!(name: "TonicMetricsClient.ExportStarted"); - - let result = client - .export(Request::from_parts( - metadata, - extensions, - ExportMetricsServiceRequest::from(metrics), - )) - .await; - - match result { - Ok(_) => { - otel_debug!(name: "TonicMetricsClient.ExportSucceeded"); - Ok(()) - } - Err(e) => { - let error = format!("{e:?}"); - otel_debug!(name: "TonicMetricsClient.ExportFailed", error = &error); - Err(OTelSdkError::InternalFailure(error)) + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + retry_with_exponential_backoff(Tokio, policy, "TonicMetricsClient.Export", { + let inner = &self.inner; + move || { + Box::pin(async move { + let (mut client, metadata, extensions) = inner + .lock() + .map_err(|e| OTelSdkError::InternalFailure(format!("Failed to acquire lock: {e:?}"))) + .and_then(|mut inner| match &mut *inner { + Some(inner) => { + let (m, e, _) = inner + .interceptor + .call(Request::new(())) + .map_err(|e| { + OTelSdkError::InternalFailure(format!( + "unexpected status while exporting {e:?}" + )) + })? 
+ .into_parts(); + Ok((inner.client.clone(), m, e)) + } + None => Err(OTelSdkError::InternalFailure( + "exporter is already shut down".into(), + )), + })?; + + otel_debug!(name: "TonicMetricsClient.ExportStarted"); + + let result = client + .export(Request::from_parts( + metadata, + extensions, + ExportMetricsServiceRequest::from(metrics), + )) + .await; + + match result { + Ok(_) => { + otel_debug!(name: "TonicMetricsClient.ExportSucceeded"); + Ok(()) + } + Err(e) => { + let error = format!("export error: {e:?}"); + otel_debug!(name: "TonicMetricsClient.ExportFailed", error = &error); + Err(OTelSdkError::InternalFailure(error)) + } + } + }) } - } + }) + .await } fn shutdown(&self) -> OTelSdkResult { From 93fc942baca705a1f73c86dff95a3c0861924129 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 12 Aug 2025 10:18:36 +0200 Subject: [PATCH 12/27] handle failure hints --- opentelemetry-otlp/Cargo.toml | 3 +- .../allowed-external-types.toml | 3 + opentelemetry-otlp/src/exporter/tonic/logs.rs | 96 ++-- .../src/exporter/tonic/metrics.rs | 99 ++-- .../src/exporter/tonic/trace.rs | 99 ++-- opentelemetry-otlp/src/lib.rs | 3 + .../src/retry_classification.rs | 480 ++++++++++++++++++ opentelemetry-sdk/src/retry.rs | 299 ++++++++++- 8 files changed, 928 insertions(+), 154 deletions(-) create mode 100644 opentelemetry-otlp/src/retry_classification.rs diff --git a/opentelemetry-otlp/Cargo.toml b/opentelemetry-otlp/Cargo.toml index 3d28ea9de2..4c70fe1205 100644 --- a/opentelemetry-otlp/Cargo.toml +++ b/opentelemetry-otlp/Cargo.toml @@ -35,6 +35,7 @@ tracing = {workspace = true, optional = true} prost = { workspace = true, optional = true } tonic = { workspace = true, optional = true } +tonic-types = { workspace = true, optional = true } tokio = { workspace = true, features = ["sync", "rt"], optional = true } reqwest = { workspace = true, optional = true } @@ -69,7 +70,7 @@ serialize = ["serde", "serde_json"] default = ["http-proto", "reqwest-blocking-client", "trace", "metrics", "logs", "internal-logs"] # grpc using tonic -grpc-tonic = ["tonic", "prost", "http", "tokio", "opentelemetry-proto/gen-tonic", "opentelemetry_sdk/rt-tokio", "opentelemetry_sdk/experimental_async_runtime"] +grpc-tonic = ["tonic", "tonic-types", "prost", "http", "tokio", "opentelemetry-proto/gen-tonic", "opentelemetry_sdk/rt-tokio", "opentelemetry_sdk/experimental_async_runtime"] gzip-tonic = ["tonic/gzip"] zstd-tonic = ["tonic/zstd"] diff --git a/opentelemetry-otlp/allowed-external-types.toml b/opentelemetry-otlp/allowed-external-types.toml index 435221a5cb..4e77afead3 100644 --- a/opentelemetry-otlp/allowed-external-types.toml +++ b/opentelemetry-otlp/allowed-external-types.toml @@ -18,4 +18,7 @@ allowed_external_types = [ "tonic::transport::tls::Identity", "tonic::transport::channel::Channel", "tonic::service::interceptor::Interceptor", + + # For retries + "tonic::status::Status" ] diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index 29946128ce..279ac72d84 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -14,7 +14,8 @@ use opentelemetry_proto::transform::logs::tonic::group_logs_by_resource_and_scop use super::BoxInterceptor; -use opentelemetry_sdk::retry::{retry_with_exponential_backoff, RetryPolicy}; +use crate::retry_classification::grpc::classify_tonic_status; +use opentelemetry_sdk::retry::{retry_with_exponential_backoff_classified, RetryPolicy}; use opentelemetry_sdk::runtime::Tokio; pub(crate) 
struct TonicLogsClient { @@ -71,54 +72,57 @@ impl LogExporter for TonicLogsClient { let batch = Arc::new(batch); - retry_with_exponential_backoff(Tokio, policy, "TonicLogsClient.Export", { - let batch = Arc::clone(&batch); - let inner = &self.inner; - let resource = &self.resource; - move || { - let batch = Arc::clone(&batch); - Box::pin(async move { - let (mut client, metadata, extensions) = match inner.lock().await.as_mut() { - Some(inner) => { - let (m, e, _) = inner - .interceptor - .call(Request::new(())) - .map_err(|e| { - OTelSdkError::InternalFailure(format!("error: {e:?}")) - })? - .into_parts(); - (inner.client.clone(), m, e) - } - None => return Err(OTelSdkError::AlreadyShutdown), - }; - - let resource_logs = group_logs_by_resource_and_scope(&batch, resource); - - otel_debug!(name: "TonicLogsClient.ExportStarted"); - - let result = client - .export(Request::from_parts( - metadata, - extensions, - ExportLogsServiceRequest { resource_logs }, + match retry_with_exponential_backoff_classified( + Tokio, + policy, + classify_tonic_status, + "TonicLogsClient.Export", + || async { + let batch_clone = Arc::clone(&batch); + + // Execute the export operation + let (mut client, metadata, extensions) = match self.inner.lock().await.as_mut() { + Some(inner) => { + let (m, e, _) = inner + .interceptor + .call(Request::new(())) + .map_err(|e| { + // Convert interceptor errors to tonic::Status for retry classification + tonic::Status::internal(format!("interceptor error: {e:?}")) + })? + .into_parts(); + (inner.client.clone(), m, e) + } + None => { + return Err(tonic::Status::failed_precondition( + "exporter already shutdown", )) - .await; - - match result { - Ok(_) => { - otel_debug!(name: "TonicLogsClient.ExportSucceeded"); - Ok(()) - } - Err(e) => { - let error = format!("export error: {e:?}"); - otel_debug!(name: "TonicLogsClient.ExportFailed", error = &error); - Err(OTelSdkError::InternalFailure(error)) - } } - }) - } - }) + }; + + let resource_logs = group_logs_by_resource_and_scope(&batch_clone, &self.resource); + + otel_debug!(name: "TonicLogsClient.ExportStarted"); + + client + .export(Request::from_parts( + metadata, + extensions, + ExportLogsServiceRequest { resource_logs }, + )) + .await + .map(|_| { + otel_debug!(name: "TonicLogsClient.ExportSucceeded"); + }) + }, + ) .await + { + Ok(_) => Ok(()), + Err(tonic_status) => Err(OTelSdkError::InternalFailure(format!( + "export error: {tonic_status:?}" + ))), + } } fn shutdown_with_timeout(&self, _timeout: time::Duration) -> OTelSdkResult { diff --git a/opentelemetry-otlp/src/exporter/tonic/metrics.rs b/opentelemetry-otlp/src/exporter/tonic/metrics.rs index 07926366e0..30a3db1ce0 100644 --- a/opentelemetry-otlp/src/exporter/tonic/metrics.rs +++ b/opentelemetry-otlp/src/exporter/tonic/metrics.rs @@ -12,7 +12,8 @@ use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Chann use super::BoxInterceptor; use crate::metric::MetricsClient; -use opentelemetry_sdk::retry::{retry_with_exponential_backoff, RetryPolicy}; +use crate::retry_classification::grpc::classify_tonic_status; +use opentelemetry_sdk::retry::{retry_with_exponential_backoff_classified, RetryPolicy}; use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicMetricsClient { @@ -63,56 +64,56 @@ impl MetricsClient for TonicMetricsClient { jitter_ms: 100, }; - retry_with_exponential_backoff(Tokio, policy, "TonicMetricsClient.Export", { - let inner = &self.inner; - move || { - Box::pin(async move { - let (mut client, metadata, extensions) = inner - .lock() - 
.map_err(|e| OTelSdkError::InternalFailure(format!("Failed to acquire lock: {e:?}"))) - .and_then(|mut inner| match &mut *inner { - Some(inner) => { - let (m, e, _) = inner - .interceptor - .call(Request::new(())) - .map_err(|e| { - OTelSdkError::InternalFailure(format!( - "unexpected status while exporting {e:?}" - )) - })? - .into_parts(); - Ok((inner.client.clone(), m, e)) - } - None => Err(OTelSdkError::InternalFailure( - "exporter is already shut down".into(), - )), - })?; - - otel_debug!(name: "TonicMetricsClient.ExportStarted"); - - let result = client - .export(Request::from_parts( - metadata, - extensions, - ExportMetricsServiceRequest::from(metrics), - )) - .await; - - match result { - Ok(_) => { - otel_debug!(name: "TonicMetricsClient.ExportSucceeded"); - Ok(()) + match retry_with_exponential_backoff_classified( + Tokio, + policy, + classify_tonic_status, + "TonicMetricsClient.Export", + || async { + // Execute the export operation + let (mut client, metadata, extensions) = self + .inner + .lock() + .map_err(|e| tonic::Status::internal(format!("Failed to acquire lock: {e:?}"))) + .and_then(|mut inner| match &mut *inner { + Some(inner) => { + let (m, e, _) = inner + .interceptor + .call(Request::new(())) + .map_err(|e| { + tonic::Status::internal(format!( + "unexpected status while exporting {e:?}" + )) + })? + .into_parts(); + Ok((inner.client.clone(), m, e)) } - Err(e) => { - let error = format!("export error: {e:?}"); - otel_debug!(name: "TonicMetricsClient.ExportFailed", error = &error); - Err(OTelSdkError::InternalFailure(error)) - } - } - }) - } - }) + None => Err(tonic::Status::failed_precondition( + "exporter is already shut down", + )), + })?; + + otel_debug!(name: "TonicMetricsClient.ExportStarted"); + + client + .export(Request::from_parts( + metadata, + extensions, + ExportMetricsServiceRequest::from(metrics), + )) + .await + .map(|_| { + otel_debug!(name: "TonicMetricsClient.ExportSucceeded"); + }) + }, + ) .await + { + Ok(_) => Ok(()), + Err(tonic_status) => Err(OTelSdkError::InternalFailure(format!( + "export error: {tonic_status:?}" + ))), + } } fn shutdown(&self) -> OTelSdkResult { diff --git a/opentelemetry-otlp/src/exporter/tonic/trace.rs b/opentelemetry-otlp/src/exporter/tonic/trace.rs index 2ce8b884a3..5cc27ab188 100644 --- a/opentelemetry-otlp/src/exporter/tonic/trace.rs +++ b/opentelemetry-otlp/src/exporter/tonic/trace.rs @@ -16,7 +16,8 @@ use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Chann use super::BoxInterceptor; -use opentelemetry_sdk::retry::{retry_with_exponential_backoff, RetryPolicy}; +use crate::retry_classification::grpc::classify_tonic_status; +use opentelemetry_sdk::retry::{retry_with_exponential_backoff_classified, RetryPolicy}; use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicTracesClient { @@ -73,54 +74,60 @@ impl SpanExporter for TonicTracesClient { let batch = Arc::new(batch); - retry_with_exponential_backoff(Tokio, policy, "TonicTracesClient.Export", { - let batch = Arc::clone(&batch); - let inner = &self.inner; - let resource = &self.resource; - move || { - let batch = Arc::clone(&batch); - Box::pin(async move { - let (mut client, metadata, extensions) = match inner { - Some(inner) => { - let (m, e, _) = inner - .interceptor - .lock() - .await // tokio::sync::Mutex doesn't return a poisoned error, so we can safely use the interceptor here - .call(Request::new(())) - .map_err(|e| OTelSdkError::InternalFailure(format!("error: {e:?}")))? 
- .into_parts(); - (inner.client.clone(), m, e) - } - None => return Err(OTelSdkError::AlreadyShutdown), - }; - - let resource_spans = group_spans_by_resource_and_scope((*batch).clone(), resource); - - otel_debug!(name: "TonicTracesClient.ExportStarted"); - - let result = client - .export(Request::from_parts( - metadata, - extensions, - ExportTraceServiceRequest { resource_spans }, + match retry_with_exponential_backoff_classified( + Tokio, + policy, + classify_tonic_status, + "TonicTracesClient.Export", + || async { + let batch_clone = Arc::clone(&batch); + + // Execute the export operation + let (mut client, metadata, extensions) = match &self.inner { + Some(inner) => { + let (m, e, _) = inner + .interceptor + .lock() + .await // tokio::sync::Mutex doesn't return a poisoned error, so we can safely use the interceptor here + .call(Request::new(())) + .map_err(|e| { + // Convert interceptor errors to tonic::Status for retry classification + tonic::Status::internal(format!("interceptor error: {e:?}")) + })? + .into_parts(); + (inner.client.clone(), m, e) + } + None => { + return Err(tonic::Status::failed_precondition( + "exporter already shutdown", )) - .await; - - match result { - Ok(_) => { - otel_debug!(name: "TonicTracesClient.ExportSucceeded"); - Ok(()) - } - Err(e) => { - let error = format!("export error: {e:?}"); - otel_debug!(name: "TonicTracesClient.ExportFailed", error = &error); - Err(OTelSdkError::InternalFailure(error)) - } } - }) - } - }) + }; + + let resource_spans = + group_spans_by_resource_and_scope((*batch_clone).clone(), &self.resource); + + otel_debug!(name: "TonicTracesClient.ExportStarted"); + + client + .export(Request::from_parts( + metadata, + extensions, + ExportTraceServiceRequest { resource_spans }, + )) + .await + .map(|_| { + otel_debug!(name: "TonicTracesClient.ExportSucceeded"); + }) + }, + ) .await + { + Ok(_) => Ok(()), + Err(tonic_status) => Err(OTelSdkError::InternalFailure(format!( + "export error: {tonic_status:?}" + ))), + } } fn shutdown(&mut self) -> OTelSdkResult { diff --git a/opentelemetry-otlp/src/lib.rs b/opentelemetry-otlp/src/lib.rs index 3474b2b590..9e21ffe6b4 100644 --- a/opentelemetry-otlp/src/lib.rs +++ b/opentelemetry-otlp/src/lib.rs @@ -366,6 +366,9 @@ mod metric; #[cfg(any(feature = "http-proto", feature = "http-json", feature = "grpc-tonic"))] mod span; +#[cfg(feature = "grpc-tonic")] +pub mod retry_classification; + pub use crate::exporter::Compression; pub use crate::exporter::ExportConfig; pub use crate::exporter::ExporterBuildError; diff --git a/opentelemetry-otlp/src/retry_classification.rs b/opentelemetry-otlp/src/retry_classification.rs new file mode 100644 index 0000000000..63a395bc3e --- /dev/null +++ b/opentelemetry-otlp/src/retry_classification.rs @@ -0,0 +1,480 @@ +//! Error classification for OTLP exporters with protocol-specific throttling support. +//! +//! This module provides error classification functions for HTTP and gRPC protocols, +//! supporting server-provided throttling hints like HTTP Retry-After headers and +//! gRPC RetryInfo metadata. + +use opentelemetry_sdk::retry::RetryErrorType; +use std::time::Duration; + +#[cfg(feature = "grpc-tonic")] +use tonic; + +#[cfg(feature = "grpc-tonic")] +use tonic_types::StatusExt; + +/// HTTP-specific error classification with Retry-After header support. +pub mod http { + use super::*; + + /// Classifies HTTP errors based on status code and headers. 
+ /// + /// # Arguments + /// * `status_code` - HTTP status code + /// * `retry_after_header` - Value of the Retry-After header, if present + /// + /// # Retry-After Header Formats + /// * Seconds: "120" + /// * HTTP Date: "Fri, 31 Dec 1999 23:59:59 GMT" + pub fn classify_http_error( + status_code: u16, + retry_after_header: Option<&str>, + ) -> RetryErrorType { + match status_code { + // 429 Too Many Requests - check for Retry-After + 429 => { + if let Some(retry_after) = retry_after_header { + if let Some(duration) = parse_retry_after(retry_after) { + return RetryErrorType::Throttled(duration); + } + } + // Fallback to retryable if no valid Retry-After + RetryErrorType::Retryable + } + // 5xx Server errors - retryable + 500..=599 => RetryErrorType::Retryable, + // 4xx Client errors (except 429) - not retryable + 400..=499 => RetryErrorType::NonRetryable, + // Other codes - retryable (network issues, etc.) + _ => RetryErrorType::Retryable, + } + } + + /// Parses the Retry-After header value. + /// + /// Supports both formats: + /// - Delay seconds: "120" + /// - HTTP date: "Fri, 31 Dec 1999 23:59:59 GMT" + /// + /// Returns None if parsing fails or delay is unreasonable. + fn parse_retry_after(retry_after: &str) -> Option { + // Try parsing as seconds first + if let Ok(seconds) = retry_after.trim().parse::() { + // Cap at 10 minutes for safety + let capped_seconds = seconds.min(600); + return Some(Duration::from_secs(capped_seconds)); + } + + // Try parsing as HTTP date + if let Ok(delay_seconds) = parse_http_date_to_delay(retry_after) { + // Cap at 10 minutes for safety + let capped_seconds = delay_seconds.min(600); + return Some(Duration::from_secs(capped_seconds)); + } + + None + } + + /// Parses HTTP date format and returns delay in seconds from now. + /// + /// This is a simplified parser for the most common HTTP date format. + /// In production, you might want to use a proper HTTP date parsing library. + fn parse_http_date_to_delay(date_str: &str) -> Result { + // For now, return error - would need proper HTTP date parsing + // This could be implemented with chrono or similar + let _ = date_str; + Err(()) + } +} + +/// gRPC-specific error classification with RetryInfo support. +pub mod grpc { + use super::*; + + /// Classifies a tonic::Status error + #[cfg(feature = "grpc-tonic")] + pub fn classify_tonic_status(status: &tonic::Status) -> RetryErrorType { + // Use tonic-types to extract RetryInfo - this is the proper way! + let retry_info_seconds = status + .get_details_retry_info() + .and_then(|retry_info| retry_info.retry_delay) + .map(|duration| duration.as_secs()); + + classify_grpc_error(status.code(), retry_info_seconds) + } + + /// Classifies gRPC errors based on status code and metadata. 
+ /// + /// Implements the OpenTelemetry OTLP specification for error handling: + /// https://opentelemetry.io/docs/specs/otlp/ + /// https://github.com/open-telemetry/opentelemetry-proto/blob/main/docs/specification.md#failures + /// + /// # Arguments + /// * `grpc_code` - gRPC status code as tonic::Code enum + /// * `retry_info_seconds` - Parsed retry delay from RetryInfo metadata, if present + fn classify_grpc_error( + grpc_code: tonic::Code, + retry_info_seconds: Option, + ) -> RetryErrorType { + match grpc_code { + // RESOURCE_EXHAUSTED: Special case per OTLP spec + // Retryable only if server provides RetryInfo indicating recovery is possible + tonic::Code::ResourceExhausted => { + if let Some(seconds) = retry_info_seconds { + // Server signals recovery is possible - use throttled retry + let capped_seconds = seconds.min(600); // Cap at 10 minutes for safety + return RetryErrorType::Throttled(std::time::Duration::from_secs( + capped_seconds, + )); + } + // No RetryInfo - treat as non-retryable per OTLP spec + RetryErrorType::NonRetryable + } + + // Retryable errors per OTLP specification + tonic::Code::Cancelled + | tonic::Code::DeadlineExceeded + | tonic::Code::Aborted + | tonic::Code::OutOfRange + | tonic::Code::Unavailable + | tonic::Code::DataLoss => RetryErrorType::Retryable, + + // Non-retryable errors per OTLP specification + tonic::Code::Unknown + | tonic::Code::InvalidArgument + | tonic::Code::NotFound + | tonic::Code::AlreadyExists + | tonic::Code::PermissionDenied + | tonic::Code::FailedPrecondition + | tonic::Code::Unimplemented + | tonic::Code::Internal + | tonic::Code::Unauthenticated => RetryErrorType::NonRetryable, + + // OK should never reach here in error scenarios, but handle gracefully + tonic::Code::Ok => RetryErrorType::NonRetryable, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // Tests for HTTP error classification + mod http_tests { + use super::*; + use crate::retry_classification::http::*; + + #[test] + fn test_http_429_with_retry_after_seconds() { + let result = classify_http_error(429, Some("30")); + assert_eq!(result, RetryErrorType::Throttled(Duration::from_secs(30))); + } + + #[test] + fn test_http_429_with_large_retry_after_capped() { + let result = classify_http_error(429, Some("900")); // 15 minutes + assert_eq!( + result, + RetryErrorType::Throttled(std::time::Duration::from_secs(600)) + ); // Capped at 10 minutes + } + + #[test] + fn test_http_429_with_invalid_retry_after() { + let result = classify_http_error(429, Some("invalid")); + assert_eq!(result, RetryErrorType::Retryable); // Fallback + } + + #[test] + fn test_http_429_without_retry_after() { + let result = classify_http_error(429, None); + assert_eq!(result, RetryErrorType::Retryable); // Fallback + } + + #[test] + fn test_http_5xx_errors() { + assert_eq!(classify_http_error(500, None), RetryErrorType::Retryable); + assert_eq!(classify_http_error(502, None), RetryErrorType::Retryable); + assert_eq!(classify_http_error(503, None), RetryErrorType::Retryable); + assert_eq!(classify_http_error(599, None), RetryErrorType::Retryable); + } + + #[test] + fn test_http_4xx_errors() { + assert_eq!(classify_http_error(400, None), RetryErrorType::NonRetryable); + assert_eq!(classify_http_error(401, None), RetryErrorType::NonRetryable); + assert_eq!(classify_http_error(403, None), RetryErrorType::NonRetryable); + assert_eq!(classify_http_error(404, None), RetryErrorType::NonRetryable); + assert_eq!(classify_http_error(499, None), RetryErrorType::NonRetryable); + } + + #[test] + fn 
test_http_other_errors() { + assert_eq!(classify_http_error(100, None), RetryErrorType::Retryable); + assert_eq!(classify_http_error(200, None), RetryErrorType::Retryable); + assert_eq!(classify_http_error(300, None), RetryErrorType::Retryable); + } + } + + // Tests for gRPC error classification using public interface + #[cfg(feature = "grpc-tonic")] + mod grpc_tests { + use crate::retry_classification::grpc::classify_tonic_status; + use opentelemetry_sdk::retry::RetryErrorType; + use tonic_types::{ErrorDetails, StatusExt}; + + #[test] + fn test_grpc_resource_exhausted_with_retry_info() { + let error_details = + ErrorDetails::with_retry_info(Some(std::time::Duration::from_secs(45))); + let status = tonic::Status::with_error_details( + tonic::Code::ResourceExhausted, + "rate limited", + error_details, + ); + let result = classify_tonic_status(&status); + assert_eq!( + result, + RetryErrorType::Throttled(std::time::Duration::from_secs(45)) + ); + } + + #[test] + fn test_grpc_resource_exhausted_with_large_retry_info_capped() { + let error_details = + ErrorDetails::with_retry_info(Some(std::time::Duration::from_secs(900))); // 15 minutes + let status = tonic::Status::with_error_details( + tonic::Code::ResourceExhausted, + "rate limited", + error_details, + ); + let result = classify_tonic_status(&status); + assert_eq!( + result, + RetryErrorType::Throttled(std::time::Duration::from_secs(600)) + ); // Capped at 10 minutes + } + + #[test] + fn test_grpc_resource_exhausted_without_retry_info() { + let status = tonic::Status::new(tonic::Code::ResourceExhausted, "rate limited"); + let result = classify_tonic_status(&status); + // Per OTLP spec: RESOURCE_EXHAUSTED without RetryInfo is non-retryable + assert_eq!(result, RetryErrorType::NonRetryable); + } + + #[test] + fn test_grpc_retryable_errors() { + // Test all retryable errors per OTLP specification + let cancelled = tonic::Status::new(tonic::Code::Cancelled, "cancelled"); + assert_eq!(classify_tonic_status(&cancelled), RetryErrorType::Retryable); + + let deadline_exceeded = + tonic::Status::new(tonic::Code::DeadlineExceeded, "deadline exceeded"); + assert_eq!( + classify_tonic_status(&deadline_exceeded), + RetryErrorType::Retryable + ); + + let aborted = tonic::Status::new(tonic::Code::Aborted, "aborted"); + assert_eq!(classify_tonic_status(&aborted), RetryErrorType::Retryable); + + let out_of_range = tonic::Status::new(tonic::Code::OutOfRange, "out of range"); + assert_eq!( + classify_tonic_status(&out_of_range), + RetryErrorType::Retryable + ); + + let unavailable = tonic::Status::new(tonic::Code::Unavailable, "unavailable"); + assert_eq!( + classify_tonic_status(&unavailable), + RetryErrorType::Retryable + ); + + let data_loss = tonic::Status::new(tonic::Code::DataLoss, "data loss"); + assert_eq!(classify_tonic_status(&data_loss), RetryErrorType::Retryable); + } + + #[test] + fn test_grpc_non_retryable_errors() { + // Test all non-retryable errors per OTLP specification + let unknown = tonic::Status::new(tonic::Code::Unknown, "unknown"); + assert_eq!( + classify_tonic_status(&unknown), + RetryErrorType::NonRetryable + ); + + let invalid_argument = + tonic::Status::new(tonic::Code::InvalidArgument, "invalid argument"); + assert_eq!( + classify_tonic_status(&invalid_argument), + RetryErrorType::NonRetryable + ); + + let not_found = tonic::Status::new(tonic::Code::NotFound, "not found"); + assert_eq!( + classify_tonic_status(¬_found), + RetryErrorType::NonRetryable + ); + + let already_exists = tonic::Status::new(tonic::Code::AlreadyExists, 
"already exists"); + assert_eq!( + classify_tonic_status(&already_exists), + RetryErrorType::NonRetryable + ); + + let permission_denied = + tonic::Status::new(tonic::Code::PermissionDenied, "permission denied"); + assert_eq!( + classify_tonic_status(&permission_denied), + RetryErrorType::NonRetryable + ); + + let failed_precondition = + tonic::Status::new(tonic::Code::FailedPrecondition, "failed precondition"); + assert_eq!( + classify_tonic_status(&failed_precondition), + RetryErrorType::NonRetryable + ); + + let unimplemented = tonic::Status::new(tonic::Code::Unimplemented, "unimplemented"); + assert_eq!( + classify_tonic_status(&unimplemented), + RetryErrorType::NonRetryable + ); + + let internal = tonic::Status::new(tonic::Code::Internal, "internal error"); + assert_eq!( + classify_tonic_status(&internal), + RetryErrorType::NonRetryable + ); + + let unauthenticated = + tonic::Status::new(tonic::Code::Unauthenticated, "unauthenticated"); + assert_eq!( + classify_tonic_status(&unauthenticated), + RetryErrorType::NonRetryable + ); + } + + #[test] + fn test_grpc_ok_code_handled() { + // OK status should be handled gracefully (though unlikely in error scenarios) + let ok = tonic::Status::new(tonic::Code::Ok, "success"); + assert_eq!(classify_tonic_status(&ok), RetryErrorType::NonRetryable); + } + + // Tests for tonic-types RetryInfo integration + #[cfg(feature = "grpc-tonic")] + mod retry_info_tests { + use super::*; + use crate::retry_classification::grpc::classify_tonic_status; + use tonic_types::{ErrorDetails, StatusExt}; + + #[test] + fn test_classify_status_with_retry_info() { + // Create a tonic::Status with RetryInfo using proper StatusExt API + let error_details = + ErrorDetails::with_retry_info(Some(std::time::Duration::from_secs(30))); + let status = tonic::Status::with_error_details( + tonic::Code::ResourceExhausted, + "rate limited", + error_details, + ); + + // Test classification + let result = classify_tonic_status(&status); + assert_eq!( + result, + RetryErrorType::Throttled(std::time::Duration::from_secs(30)) + ); + } + + #[test] + fn test_classify_status_with_fractional_retry_info() { + // Create a tonic::Status with fractional seconds RetryInfo + let error_details = + ErrorDetails::with_retry_info(Some(std::time::Duration::from_millis(5500))); // 5.5 seconds + let status = tonic::Status::with_error_details( + tonic::Code::ResourceExhausted, + "rate limited", + error_details, + ); + + // Should use exact duration (5.5s = 5s) + let result = classify_tonic_status(&status); + assert_eq!( + result, + RetryErrorType::Throttled(std::time::Duration::from_secs(5)) + ); + } + + #[test] + fn test_classify_status_without_retry_info() { + // Status with resource_exhausted but no RetryInfo + let status = tonic::Status::new(tonic::Code::ResourceExhausted, "rate limited"); + + // Per OTLP spec: should be non-retryable without RetryInfo + let result = classify_tonic_status(&status); + assert_eq!(result, RetryErrorType::NonRetryable); + } + + #[test] + fn test_classify_status_non_retryable_error() { + // Status with non-retryable error code + let status = tonic::Status::new(tonic::Code::InvalidArgument, "bad request"); + + let result = classify_tonic_status(&status); + assert_eq!(result, RetryErrorType::NonRetryable); + } + + #[test] + fn test_classify_status_retryable_error() { + // Status with retryable error code + let status = tonic::Status::new(tonic::Code::Unavailable, "service unavailable"); + + let result = classify_tonic_status(&status); + assert_eq!(result, 
RetryErrorType::Retryable); + } + + #[test] + fn test_classify_status_large_retry_delay() { + // Test with large retry delay - should be capped at 10 minutes + let error_details = + ErrorDetails::with_retry_info(Some(std::time::Duration::from_secs(3600))); // 1 hour + let status = tonic::Status::with_error_details( + tonic::Code::ResourceExhausted, + "rate limited", + error_details, + ); + + let result = classify_tonic_status(&status); + // Should be capped at 10 minutes (600 seconds) + assert_eq!( + result, + RetryErrorType::Throttled(std::time::Duration::from_secs(600)) + ); + } + + #[test] + fn test_status_ext_get_details() { + // Test that StatusExt works correctly + let error_details = + ErrorDetails::with_retry_info(Some(std::time::Duration::from_secs(45))); + let status = tonic::Status::with_error_details( + tonic::Code::ResourceExhausted, + "rate limited", + error_details, + ); + + // Direct extraction should work + let extracted = status.get_details_retry_info(); + assert!(extracted.is_some()); + + let retry_delay = extracted.unwrap().retry_delay; + assert_eq!(retry_delay, Some(std::time::Duration::from_secs(45))); + } + } + } +} diff --git a/opentelemetry-sdk/src/retry.rs b/opentelemetry-sdk/src/retry.rs index 1ff347b096..a46e91f628 100644 --- a/opentelemetry-sdk/src/retry.rs +++ b/opentelemetry-sdk/src/retry.rs @@ -12,12 +12,25 @@ use opentelemetry::otel_warn; #[cfg(feature = "experimental_async_runtime")] use std::future::Future; +use std::time::Duration; #[cfg(feature = "experimental_async_runtime")] -use std::time::{Duration, SystemTime}; +use std::time::SystemTime; #[cfg(feature = "experimental_async_runtime")] use crate::runtime::Runtime; +/// Classification of errors for retry purposes. +#[derive(Debug, Clone, PartialEq)] +pub enum RetryErrorType { + /// Error is not retryable (e.g., authentication failure, bad request). + NonRetryable, + /// Error is retryable with exponential backoff (e.g., server error, network timeout). + Retryable, + /// Error indicates throttling - wait for the specified duration before retrying. + /// This overrides exponential backoff timing. + Throttled(Duration), +} + /// Configuration for retry policy. #[derive(Debug)] pub struct RetryPolicy { @@ -73,6 +86,50 @@ pub async fn retry_with_exponential_backoff( runtime: R, policy: RetryPolicy, operation_name: &str, + operation: F, +) -> Result +where + R: Runtime, + F: FnMut() -> Fut, + E: std::fmt::Debug, + Fut: Future>, +{ + // Use a simple classifier that treats all errors as retryable + let simple_classifier = |_: &E| RetryErrorType::Retryable; + + retry_with_exponential_backoff_classified( + runtime, + policy, + simple_classifier, + operation_name, + operation, + ) + .await +} + +/// Enhanced retry with exponential backoff, jitter, and error classification. +/// +/// This function provides sophisticated retry behavior by classifying errors +/// and honoring server-provided throttling hints (e.g., HTTP Retry-After, gRPC RetryInfo). +/// +/// # Arguments +/// +/// * `runtime` - The async runtime to use for delays. +/// * `policy` - The retry policy configuration. +/// * `error_classifier` - Function to classify errors for retry decisions. +/// * `operation_name` - The name of the operation being retried. +/// * `operation` - The operation to be retried. +/// +/// # Returns +/// +/// A `Result` containing the operation's result or an error if max retries are reached +/// or a non-retryable error occurs. 
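A minimal call-site sketch (the classifier, `MyError`, and `send_request` are illustrative placeholders, not part of this patch; the runtime, policy, and error types are the ones defined in this module):

    let policy = RetryPolicy {
        max_retries: 3,
        initial_delay_ms: 100,
        max_delay_ms: 1600,
        jitter_ms: 100,
    };
    // Map transport errors to retry decisions; throttled errors carry the server-provided delay.
    let classify = |e: &MyError| match e.retry_after() {
        Some(delay) => RetryErrorType::Throttled(delay),
        None if e.is_transient() => RetryErrorType::Retryable,
        None => RetryErrorType::NonRetryable,
    };
    let result = retry_with_exponential_backoff_classified(
        Tokio,
        policy,
        classify,
        "Example.Export",
        || async { send_request().await }, // returns Result<(), MyError>
    )
    .await;
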
+#[cfg(feature = "experimental_async_runtime")] +pub async fn retry_with_exponential_backoff_classified( + runtime: R, + policy: RetryPolicy, + error_classifier: C, + operation_name: &str, mut operation: F, ) -> Result where @@ -80,6 +137,7 @@ where F: FnMut() -> Fut, E: std::fmt::Debug, Fut: Future>, + C: Fn(&E) -> RetryErrorType, { let mut attempt = 0; let mut delay = policy.initial_delay_ms; @@ -87,18 +145,40 @@ where loop { match operation().await { Ok(result) => return Ok(result), // Return the result if the operation succeeds - Err(err) if attempt < policy.max_retries => { - attempt += 1; - // Log the error and retry after a delay with jitter - otel_warn!(name: "OtlpRetry", message = format!("Retrying operation {:?} due to error: {:?}", operation_name, err)); - let jitter = generate_jitter(policy.jitter_ms); - let delay_with_jitter = std::cmp::min(delay + jitter, policy.max_delay_ms); - runtime - .delay(Duration::from_millis(delay_with_jitter)) - .await; - delay = std::cmp::min(delay * 2, policy.max_delay_ms); // Exponential backoff + Err(err) => { + // Classify the error + let error_type = error_classifier(&err); + + match error_type { + RetryErrorType::NonRetryable => { + otel_warn!(name: "OtlpRetry", message = format!("Operation {:?} failed with non-retryable error: {:?}", operation_name, err)); + return Err(err); + } + RetryErrorType::Retryable if attempt < policy.max_retries => { + attempt += 1; + // Use exponential backoff with jitter + otel_warn!(name: "OtlpRetry", message = format!("Retrying operation {:?} due to retryable error: {:?}", operation_name, err)); + let jitter = generate_jitter(policy.jitter_ms); + let delay_with_jitter = std::cmp::min(delay + jitter, policy.max_delay_ms); + runtime + .delay(Duration::from_millis(delay_with_jitter)) + .await; + delay = std::cmp::min(delay * 2, policy.max_delay_ms); // Exponential backoff + } + RetryErrorType::Throttled(server_delay) if attempt < policy.max_retries => { + attempt += 1; + // Use server-specified delay (overrides exponential backoff) + otel_warn!(name: "OtlpRetry", message = format!("Retrying operation {:?} after server-specified throttling delay: {:?}", operation_name, server_delay)); + runtime.delay(server_delay).await; + // Don't update exponential backoff delay for next attempt since server provided specific timing + } + _ => { + // Max retries reached + otel_warn!(name: "OtlpRetry", message = format!("Operation {:?} failed after {} attempts: {:?}", operation_name, attempt, err)); + return Err(err); + } + } } - Err(err) => return Err(err), // Return the error if max retries are reached } } } @@ -228,4 +308,199 @@ mod tests { assert!(result.is_err()); // Ensure the operation times out } + + // Tests for error classification (Phase 1) + #[test] + fn test_retry_error_type_equality() { + assert_eq!(RetryErrorType::NonRetryable, RetryErrorType::NonRetryable); + assert_eq!(RetryErrorType::Retryable, RetryErrorType::Retryable); + assert_eq!( + RetryErrorType::Throttled(Duration::from_secs(30)), + RetryErrorType::Throttled(Duration::from_secs(30)) + ); + assert_ne!(RetryErrorType::Retryable, RetryErrorType::NonRetryable); + } + + // Tests for enhanced retry function (Phase 3) + #[tokio::test] + async fn test_retry_with_throttling_non_retryable_error() { + let runtime = Tokio; + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + let attempts = AtomicUsize::new(0); + + // Classifier that returns non-retryable + let classifier = |_: &()| 
RetryErrorType::NonRetryable; + + let result = retry_with_exponential_backoff_classified( + runtime, + policy, + classifier, + "test_operation", + || { + attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async { Err::<(), _>(()) }) // Always fail + }, + ) + .await; + + assert!(result.is_err()); + assert_eq!(attempts.load(Ordering::SeqCst), 1); // Should only try once + } + + #[tokio::test] + async fn test_retry_with_throttling_retryable_error() { + let runtime = Tokio; + let policy = RetryPolicy { + max_retries: 2, + initial_delay_ms: 10, // Short delay for test + max_delay_ms: 100, + jitter_ms: 5, + }; + + let attempts = AtomicUsize::new(0); + + // Classifier that returns retryable + let classifier = |_: &()| RetryErrorType::Retryable; + + let result = retry_with_exponential_backoff_classified( + runtime, + policy, + classifier, + "test_operation", + || { + let attempt = attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async move { + if attempt < 1 { + Err::<&str, ()>(()) // Fail first attempt + } else { + Ok("success") // Succeed on retry + } + }) + }, + ) + .await; + + assert_eq!(result, Ok("success")); + assert_eq!(attempts.load(Ordering::SeqCst), 2); // Should try twice + } + + #[tokio::test] + async fn test_retry_with_throttling_throttled_error() { + let runtime = Tokio; + let policy = RetryPolicy { + max_retries: 2, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + let attempts = AtomicUsize::new(0); + + // Classifier that returns throttled with short delay + let classifier = |_: &()| RetryErrorType::Throttled(Duration::from_millis(10)); + + let start_time = std::time::Instant::now(); + + let result = retry_with_exponential_backoff_classified( + runtime, + policy, + classifier, + "test_operation", + || { + let attempt = attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async move { + if attempt < 1 { + Err::<&str, ()>(()) // Fail first attempt (will be throttled) + } else { + Ok("success") // Succeed on retry + } + }) + }, + ) + .await; + + let elapsed = start_time.elapsed(); + + assert_eq!(result, Ok("success")); + assert_eq!(attempts.load(Ordering::SeqCst), 2); // Should try twice + assert!(elapsed >= Duration::from_millis(10)); // Should have waited for throttle delay + } + + #[tokio::test] + async fn test_retry_with_throttling_max_attempts_exceeded() { + let runtime = Tokio; + let policy = RetryPolicy { + max_retries: 1, // Only 1 retry + initial_delay_ms: 10, + max_delay_ms: 100, + jitter_ms: 5, + }; + + let attempts = AtomicUsize::new(0); + + // Classifier that returns retryable + let classifier = |_: &()| RetryErrorType::Retryable; + + let result = retry_with_exponential_backoff_classified( + runtime, + policy, + classifier, + "test_operation", + || { + attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async { Err::<(), _>(()) }) // Always fail + }, + ) + .await; + + assert!(result.is_err()); + assert_eq!(attempts.load(Ordering::SeqCst), 2); // Initial attempt + 1 retry + } + + #[tokio::test] + async fn test_retry_with_throttling_mixed_error_types() { + let runtime = Tokio; + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 10, + max_delay_ms: 100, + jitter_ms: 5, + }; + + let attempts = AtomicUsize::new(0); + + // Classifier that returns different types based on attempt number + let classifier = |err: &usize| match *err { + 0 => RetryErrorType::Retryable, + 1 => RetryErrorType::Throttled(Duration::from_millis(10)), + _ => RetryErrorType::Retryable, + }; + + let result = retry_with_exponential_backoff_classified( + runtime, + 
policy, + classifier, + "test_operation", + || { + let attempt = attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async move { + if attempt < 2 { + Err(attempt) // Return attempt number as error + } else { + Ok("success") // Succeed on third attempt + } + }) + }, + ) + .await; + + assert_eq!(result, Ok("success")); + assert_eq!(attempts.load(Ordering::SeqCst), 3); // Should try three times + } } From 30fdb4f464613e0e8bfb14245a0687d07206d487 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 12 Aug 2025 12:25:59 +0200 Subject: [PATCH 13/27] simplify retry interface --- opentelemetry-otlp/src/exporter/tonic/logs.rs | 4 +- .../src/exporter/tonic/metrics.rs | 4 +- .../src/exporter/tonic/trace.rs | 4 +- opentelemetry-sdk/src/retry.rs | 219 +++++++----------- 4 files changed, 93 insertions(+), 138 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index 279ac72d84..fee1244686 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -15,7 +15,7 @@ use opentelemetry_proto::transform::logs::tonic::group_logs_by_resource_and_scop use super::BoxInterceptor; use crate::retry_classification::grpc::classify_tonic_status; -use opentelemetry_sdk::retry::{retry_with_exponential_backoff_classified, RetryPolicy}; +use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicLogsClient { @@ -72,7 +72,7 @@ impl LogExporter for TonicLogsClient { let batch = Arc::new(batch); - match retry_with_exponential_backoff_classified( + match retry_with_backoff( Tokio, policy, classify_tonic_status, diff --git a/opentelemetry-otlp/src/exporter/tonic/metrics.rs b/opentelemetry-otlp/src/exporter/tonic/metrics.rs index 30a3db1ce0..d6244b4bae 100644 --- a/opentelemetry-otlp/src/exporter/tonic/metrics.rs +++ b/opentelemetry-otlp/src/exporter/tonic/metrics.rs @@ -13,7 +13,7 @@ use super::BoxInterceptor; use crate::metric::MetricsClient; use crate::retry_classification::grpc::classify_tonic_status; -use opentelemetry_sdk::retry::{retry_with_exponential_backoff_classified, RetryPolicy}; +use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicMetricsClient { @@ -64,7 +64,7 @@ impl MetricsClient for TonicMetricsClient { jitter_ms: 100, }; - match retry_with_exponential_backoff_classified( + match retry_with_backoff( Tokio, policy, classify_tonic_status, diff --git a/opentelemetry-otlp/src/exporter/tonic/trace.rs b/opentelemetry-otlp/src/exporter/tonic/trace.rs index 5cc27ab188..a5239b4602 100644 --- a/opentelemetry-otlp/src/exporter/tonic/trace.rs +++ b/opentelemetry-otlp/src/exporter/tonic/trace.rs @@ -17,7 +17,7 @@ use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Chann use super::BoxInterceptor; use crate::retry_classification::grpc::classify_tonic_status; -use opentelemetry_sdk::retry::{retry_with_exponential_backoff_classified, RetryPolicy}; +use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicTracesClient { @@ -74,7 +74,7 @@ impl SpanExporter for TonicTracesClient { let batch = Arc::new(batch); - match retry_with_exponential_backoff_classified( + match retry_with_backoff( Tokio, policy, classify_tonic_status, diff --git a/opentelemetry-sdk/src/retry.rs b/opentelemetry-sdk/src/retry.rs index a46e91f628..48f4cf49a7 100644 --- a/opentelemetry-sdk/src/retry.rs +++ 
b/opentelemetry-sdk/src/retry.rs @@ -3,10 +3,10 @@ //! The `RetryPolicy` struct defines the configuration for the retry behavior, including the maximum //! number of retries, initial delay, maximum delay, and jitter. //! -//! The `retry_with_exponential_backoff` function retries the given operation according to the +//! The `retry_with_backoff` function retries the given operation according to the //! specified retry policy, using exponential backoff and jitter to determine the delay between -//! retries. The function logs errors and retries the operation until it succeeds or the maximum -//! number of retries is reached. +//! retries. The function uses error classification to determine retry behavior and can honor +//! server-provided throttling hints. #[cfg(feature = "experimental_async_runtime")] use opentelemetry::otel_warn; @@ -69,45 +69,7 @@ fn generate_jitter(max_jitter: u64) -> u64 { nanos as u64 % (max_jitter + 1) } -/// Retries the given operation with exponential backoff and jitter. -/// -/// # Arguments -/// -/// * `runtime` - The async runtime to use for delays. -/// * `policy` - The retry policy configuration. -/// * `operation_name` - The name of the operation being retried. -/// * `operation` - The operation to be retried. -/// -/// # Returns -/// -/// A `Result` containing the operation's result or an error if the maximum retries are reached. -#[cfg(feature = "experimental_async_runtime")] -pub async fn retry_with_exponential_backoff( - runtime: R, - policy: RetryPolicy, - operation_name: &str, - operation: F, -) -> Result -where - R: Runtime, - F: FnMut() -> Fut, - E: std::fmt::Debug, - Fut: Future>, -{ - // Use a simple classifier that treats all errors as retryable - let simple_classifier = |_: &E| RetryErrorType::Retryable; - - retry_with_exponential_backoff_classified( - runtime, - policy, - simple_classifier, - operation_name, - operation, - ) - .await -} - -/// Enhanced retry with exponential backoff, jitter, and error classification. +/// Retries the given operation with exponential backoff, jitter, and error classification. /// /// This function provides sophisticated retry behavior by classifying errors /// and honoring server-provided throttling hints (e.g., HTTP Retry-After, gRPC RetryInfo). @@ -125,7 +87,7 @@ where /// A `Result` containing the operation's result or an error if max retries are reached /// or a non-retryable error occurs. #[cfg(feature = "experimental_async_runtime")] -pub async fn retry_with_exponential_backoff_classified( +pub async fn retry_with_backoff( runtime: R, policy: RetryPolicy, error_classifier: C, @@ -186,9 +148,10 @@ where /// No-op retry function for when experimental_async_runtime is not enabled. /// This function will execute the operation exactly once without any retries. 
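Because the stub keeps the same parameter list as the classified variant, call sites compile unchanged when the feature is off; roughly (with `policy`, `classify`, and `op` as in the sketch above):

    // With `experimental_async_runtime` disabled, this runs `op` exactly once and
    // returns its first result; the policy and classifier are accepted but ignored.
    let result = retry_with_backoff(NoOpRuntime, policy, classify, "Example.Export", op).await;
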
#[cfg(not(feature = "experimental_async_runtime"))] -pub async fn retry_with_exponential_backoff( +pub async fn retry_with_backoff( _runtime: R, _policy: RetryPolicy, + _error_classifier: C, _operation_name: &str, mut operation: F, ) -> Result @@ -227,9 +190,13 @@ mod tests { jitter_ms: 100, }; - let result = retry_with_exponential_backoff(runtime, policy, "test_operation", || { - Box::pin(async { Ok::<_, ()>("success") }) - }) + let result = retry_with_backoff( + runtime, + policy, + |_: &()| RetryErrorType::Retryable, + "test_operation", + || Box::pin(async { Ok::<_, ()>("success") }), + ) .await; assert_eq!(result, Ok("success")); @@ -248,16 +215,22 @@ mod tests { let attempts = AtomicUsize::new(0); - let result = retry_with_exponential_backoff(runtime, policy, "test_operation", || { - let attempt = attempts.fetch_add(1, Ordering::SeqCst); - Box::pin(async move { - if attempt < 2 { - Err::<&str, &str>("error") // Fail the first two attempts - } else { - Ok::<&str, &str>("success") // Succeed on the third attempt - } - }) - }) + let result = retry_with_backoff( + runtime, + policy, + |_: &&str| RetryErrorType::Retryable, + "test_operation", + || { + let attempt = attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async move { + if attempt < 2 { + Err::<&str, &str>("error") // Fail the first two attempts + } else { + Ok::<&str, &str>("success") // Succeed on the third attempt + } + }) + }, + ) .await; assert_eq!(result, Ok("success")); @@ -277,10 +250,16 @@ mod tests { let attempts = AtomicUsize::new(0); - let result = retry_with_exponential_backoff(runtime, policy, "test_operation", || { - attempts.fetch_add(1, Ordering::SeqCst); - Box::pin(async { Err::<(), _>("error") }) // Always fail - }) + let result = retry_with_backoff( + runtime, + policy, + |_: &&str| RetryErrorType::Retryable, + "test_operation", + || { + attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async { Err::<(), _>("error") }) // Always fail + }, + ) .await; assert_eq!(result, Err("error")); @@ -300,9 +279,15 @@ mod tests { let result = timeout( Duration::from_secs(1), - retry_with_exponential_backoff(runtime, policy, "test_operation", || { - Box::pin(async { Err::<(), _>("error") }) // Always fail - }), + retry_with_backoff( + runtime, + policy, + |_: &&str| RetryErrorType::Retryable, + "test_operation", + || { + Box::pin(async { Err::<(), _>("error") }) // Always fail + }, + ), ) .await; @@ -337,16 +322,10 @@ mod tests { // Classifier that returns non-retryable let classifier = |_: &()| RetryErrorType::NonRetryable; - let result = retry_with_exponential_backoff_classified( - runtime, - policy, - classifier, - "test_operation", - || { - attempts.fetch_add(1, Ordering::SeqCst); - Box::pin(async { Err::<(), _>(()) }) // Always fail - }, - ) + let result = retry_with_backoff(runtime, policy, classifier, "test_operation", || { + attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async { Err::<(), _>(()) }) // Always fail + }) .await; assert!(result.is_err()); @@ -368,22 +347,16 @@ mod tests { // Classifier that returns retryable let classifier = |_: &()| RetryErrorType::Retryable; - let result = retry_with_exponential_backoff_classified( - runtime, - policy, - classifier, - "test_operation", - || { - let attempt = attempts.fetch_add(1, Ordering::SeqCst); - Box::pin(async move { - if attempt < 1 { - Err::<&str, ()>(()) // Fail first attempt - } else { - Ok("success") // Succeed on retry - } - }) - }, - ) + let result = retry_with_backoff(runtime, policy, classifier, "test_operation", || { + let attempt = 
attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async move { + if attempt < 1 { + Err::<&str, ()>(()) // Fail first attempt + } else { + Ok("success") // Succeed on retry + } + }) + }) .await; assert_eq!(result, Ok("success")); @@ -407,22 +380,16 @@ mod tests { let start_time = std::time::Instant::now(); - let result = retry_with_exponential_backoff_classified( - runtime, - policy, - classifier, - "test_operation", - || { - let attempt = attempts.fetch_add(1, Ordering::SeqCst); - Box::pin(async move { - if attempt < 1 { - Err::<&str, ()>(()) // Fail first attempt (will be throttled) - } else { - Ok("success") // Succeed on retry - } - }) - }, - ) + let result = retry_with_backoff(runtime, policy, classifier, "test_operation", || { + let attempt = attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async move { + if attempt < 1 { + Err::<&str, ()>(()) // Fail first attempt (will be throttled) + } else { + Ok("success") // Succeed on retry + } + }) + }) .await; let elapsed = start_time.elapsed(); @@ -447,16 +414,10 @@ mod tests { // Classifier that returns retryable let classifier = |_: &()| RetryErrorType::Retryable; - let result = retry_with_exponential_backoff_classified( - runtime, - policy, - classifier, - "test_operation", - || { - attempts.fetch_add(1, Ordering::SeqCst); - Box::pin(async { Err::<(), _>(()) }) // Always fail - }, - ) + let result = retry_with_backoff(runtime, policy, classifier, "test_operation", || { + attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async { Err::<(), _>(()) }) // Always fail + }) .await; assert!(result.is_err()); @@ -482,22 +443,16 @@ mod tests { _ => RetryErrorType::Retryable, }; - let result = retry_with_exponential_backoff_classified( - runtime, - policy, - classifier, - "test_operation", - || { - let attempt = attempts.fetch_add(1, Ordering::SeqCst); - Box::pin(async move { - if attempt < 2 { - Err(attempt) // Return attempt number as error - } else { - Ok("success") // Succeed on third attempt - } - }) - }, - ) + let result = retry_with_backoff(runtime, policy, classifier, "test_operation", || { + let attempt = attempts.fetch_add(1, Ordering::SeqCst); + Box::pin(async move { + if attempt < 2 { + Err(attempt) // Return attempt number as error + } else { + Ok("success") // Succeed on third attempt + } + }) + }) .await; assert_eq!(result, Ok("success")); From 61766de1a23b85dd70a0845ca26aa3da85913051 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 12 Aug 2025 13:10:07 +0200 Subject: [PATCH 14/27] Implement retry for http trace --- opentelemetry-otlp/Cargo.toml | 5 + opentelemetry-otlp/src/exporter/http/trace.rs | 131 ++++++++++++++++++ opentelemetry-otlp/src/lib.rs | 2 +- .../src/retry_classification.rs | 9 +- 4 files changed, 142 insertions(+), 5 deletions(-) diff --git a/opentelemetry-otlp/Cargo.toml b/opentelemetry-otlp/Cargo.toml index 4c70fe1205..10939cac5f 100644 --- a/opentelemetry-otlp/Cargo.toml +++ b/opentelemetry-otlp/Cargo.toml @@ -83,6 +83,11 @@ tls-webpki-roots = ["tls", "tonic/tls-webpki-roots"] # http binary http-proto = ["prost", "opentelemetry-http", "opentelemetry-proto/gen-tonic-messages", "http", "trace", "metrics"] + +# http with retry support. +# What should we do with this? We need the async_runtime. gRPC exporters already need it. 
+http-retry = ["opentelemetry_sdk/experimental_async_runtime", "opentelemetry_sdk/rt-tokio", "tokio"] + http-json = ["serde_json", "prost", "opentelemetry-http", "opentelemetry-proto/gen-tonic-messages", "opentelemetry-proto/with-serde", "http", "trace", "metrics"] reqwest-blocking-client = ["reqwest/blocking", "opentelemetry-http/reqwest-blocking"] reqwest-client = ["reqwest", "opentelemetry-http/reqwest"] diff --git a/opentelemetry-otlp/src/exporter/http/trace.rs b/opentelemetry-otlp/src/exporter/http/trace.rs index 28bc7fc5fb..b94d74235d 100644 --- a/opentelemetry-otlp/src/exporter/http/trace.rs +++ b/opentelemetry-otlp/src/exporter/http/trace.rs @@ -8,7 +8,138 @@ use opentelemetry_sdk::{ trace::{SpanData, SpanExporter}, }; +#[cfg(feature = "http-retry")] +use crate::retry_classification::http::classify_http_error; +#[cfg(feature = "http-retry")] +use opentelemetry_sdk::retry::{retry_with_backoff, RetryErrorType, RetryPolicy}; +#[cfg(feature = "http-retry")] +use opentelemetry_sdk::runtime::Tokio; + +#[cfg(feature = "http-retry")] +/// HTTP-specific error wrapper for retry classification +#[derive(Debug)] +struct HttpExportError { + status_code: u16, + retry_after: Option, + message: String, +} + +#[cfg(feature = "http-retry")] +/// Classify HTTP export errors for retry decisions +fn classify_http_export_error(error: &HttpExportError) -> RetryErrorType { + classify_http_error(error.status_code, error.retry_after.as_deref()) +} + impl SpanExporter for OtlpHttpClient { + #[cfg(feature = "http-retry")] + async fn export(&self, batch: Vec) -> OTelSdkResult { + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + let batch = Arc::new(batch); + + retry_with_backoff( + Tokio, + policy, + classify_http_export_error, + "HttpTracesClient.Export", + || async { + let batch_clone = Arc::clone(&batch); + + // Get client + let client = self + .client + .lock() + .map_err(|e| HttpExportError { + status_code: 500, + retry_after: None, + message: format!("Mutex lock failed: {e}"), + })? + .as_ref() + .ok_or_else(|| HttpExportError { + status_code: 500, + retry_after: None, + message: "Exporter already shutdown".to_string(), + })? 
+ .clone(); + + // Build request body + let (body, content_type, content_encoding) = self + .build_trace_export_body((*batch_clone).clone()) + .map_err(|e| HttpExportError { + status_code: 400, + retry_after: None, + message: format!("Failed to build request body: {e}"), + })?; + + // Build HTTP request + let mut request_builder = http::Request::builder() + .method(Method::POST) + .uri(&self.collector_endpoint) + .header(CONTENT_TYPE, content_type); + + if let Some(encoding) = content_encoding { + request_builder = request_builder.header("Content-Encoding", encoding); + } + + let mut request = + request_builder + .body(body.into()) + .map_err(|e| HttpExportError { + status_code: 400, + retry_after: None, + message: format!("Failed to build HTTP request: {e}"), + })?; + + for (k, v) in &self.headers { + request.headers_mut().insert(k.clone(), v.clone()); + } + + let request_uri = request.uri().to_string(); + otel_debug!(name: "HttpTracesClient.ExportStarted"); + + // Send request + let response = client.send_bytes(request).await.map_err(|e| { + HttpExportError { + status_code: 0, // Network error + retry_after: None, + message: format!("Network error: {e:?}"), + } + })?; + + let status_code = response.status().as_u16(); + let retry_after = response + .headers() + .get("retry-after") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + if !response.status().is_success() { + return Err(HttpExportError { + status_code, + retry_after, + message: format!( + "HTTP export failed. Url: {}, Status: {}, Response: {:?}", + request_uri, + status_code, + response.body() + ), + }); + } + + otel_debug!(name: "HttpTracesClient.ExportSucceeded"); + Ok(()) + }, + ) + .await + .map_err(|e| OTelSdkError::InternalFailure(e.message)) + } + + #[cfg(not(feature = "http-retry"))] async fn export(&self, batch: Vec) -> OTelSdkResult { let client = match self .client diff --git a/opentelemetry-otlp/src/lib.rs b/opentelemetry-otlp/src/lib.rs index 9e21ffe6b4..e8de66d273 100644 --- a/opentelemetry-otlp/src/lib.rs +++ b/opentelemetry-otlp/src/lib.rs @@ -366,7 +366,7 @@ mod metric; #[cfg(any(feature = "http-proto", feature = "http-json", feature = "grpc-tonic"))] mod span; -#[cfg(feature = "grpc-tonic")] +#[cfg(any(feature = "grpc-tonic", feature = "http-retry"))] pub mod retry_classification; pub use crate::exporter::Compression; diff --git a/opentelemetry-otlp/src/retry_classification.rs b/opentelemetry-otlp/src/retry_classification.rs index 63a395bc3e..06a8d914ad 100644 --- a/opentelemetry-otlp/src/retry_classification.rs +++ b/opentelemetry-otlp/src/retry_classification.rs @@ -60,14 +60,14 @@ pub mod http { fn parse_retry_after(retry_after: &str) -> Option { // Try parsing as seconds first if let Ok(seconds) = retry_after.trim().parse::() { - // Cap at 10 minutes for safety + // Cap at 10 minutes. TODO - what's sensible here? let capped_seconds = seconds.min(600); return Some(Duration::from_secs(capped_seconds)); } // Try parsing as HTTP date if let Ok(delay_seconds) = parse_http_date_to_delay(retry_after) { - // Cap at 10 minutes for safety + // Cap at 10 minutes. TODO - what's sensible here? let capped_seconds = delay_seconds.min(600); return Some(Duration::from_secs(capped_seconds)); } @@ -78,7 +78,7 @@ pub mod http { /// Parses HTTP date format and returns delay in seconds from now. /// /// This is a simplified parser for the most common HTTP date format. - /// In production, you might want to use a proper HTTP date parsing library. + /// TODO - should we use a library here? 
fn parse_http_date_to_delay(date_str: &str) -> Result { // For now, return error - would need proper HTTP date parsing // This could be implemented with chrono or similar @@ -88,6 +88,7 @@ pub mod http { } /// gRPC-specific error classification with RetryInfo support. +#[cfg(feature = "grpc-tonic")] pub mod grpc { use super::*; @@ -122,7 +123,7 @@ pub mod grpc { tonic::Code::ResourceExhausted => { if let Some(seconds) = retry_info_seconds { // Server signals recovery is possible - use throttled retry - let capped_seconds = seconds.min(600); // Cap at 10 minutes for safety + let capped_seconds = seconds.min(600); // Cap at 10 minutes. TODO - what's sensible here? return RetryErrorType::Throttled(std::time::Duration::from_secs( capped_seconds, )); From 2371f39ffd273ba9693f5a6a8c6608dd95f9dd60 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 12 Aug 2025 13:30:03 +0200 Subject: [PATCH 15/27] Implement retry for http metrics and logs --- opentelemetry-otlp/src/exporter/http/logs.rs | 115 ++++++++++++++++++ .../src/exporter/http/metrics.rs | 114 +++++++++++++++++ opentelemetry-otlp/src/exporter/http/mod.rs | 31 +++++ opentelemetry-otlp/src/exporter/http/trace.rs | 61 ++++------ 4 files changed, 282 insertions(+), 39 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/http/logs.rs b/opentelemetry-otlp/src/exporter/http/logs.rs index 3a66266652..cc61a10474 100644 --- a/opentelemetry-otlp/src/exporter/http/logs.rs +++ b/opentelemetry-otlp/src/exporter/http/logs.rs @@ -4,9 +4,124 @@ use http::{header::CONTENT_TYPE, Method}; use opentelemetry::otel_debug; use opentelemetry_sdk::error::{OTelSdkError, OTelSdkResult}; use opentelemetry_sdk::logs::{LogBatch, LogExporter}; +#[cfg(feature = "http-retry")] +use std::sync::Arc; use std::time; +#[cfg(feature = "http-retry")] +use super::{classify_http_export_error, HttpExportError, HttpRetryData}; +#[cfg(feature = "http-retry")] +use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; +#[cfg(feature = "http-retry")] +use opentelemetry_sdk::runtime::Tokio; + impl LogExporter for OtlpHttpClient { + #[cfg(feature = "http-retry")] + async fn export(&self, batch: LogBatch<'_>) -> OTelSdkResult { + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + // Build request body once before retry loop since LogBatch contains borrowed data + let (body, content_type, content_encoding) = self + .build_logs_export_body(batch) + .map_err(OTelSdkError::InternalFailure)?; + + let retry_data = Arc::new(HttpRetryData { + body, + headers: self.headers.clone(), + endpoint: self.collector_endpoint.to_string(), + }); + + retry_with_backoff( + Tokio, + policy, + classify_http_export_error, + "HttpLogsClient.Export", + || async { + // Get client + let client = self + .client + .lock() + .map_err(|e| HttpExportError { + status_code: 500, + retry_after: None, + message: format!("Mutex lock failed: {e}"), + })? + .as_ref() + .ok_or_else(|| HttpExportError { + status_code: 500, + retry_after: None, + message: "Exporter already shutdown".to_string(), + })? 
+ .clone(); + + // Build HTTP request + let mut request_builder = http::Request::builder() + .method(Method::POST) + .uri(&retry_data.endpoint) + .header(CONTENT_TYPE, content_type); + + if let Some(encoding) = content_encoding { + request_builder = request_builder.header(CONTENT_ENCODING, encoding); + } + + let mut request = request_builder + .body(retry_data.body.clone().into()) + .map_err(|e| HttpExportError { + status_code: 400, + retry_after: None, + message: format!("Failed to build HTTP request: {e}"), + })?; + + for (k, v) in &retry_data.headers { + request.headers_mut().insert(k.clone(), v.clone()); + } + + let request_uri = request.uri().to_string(); + otel_debug!(name: "HttpLogsClient.ExportStarted"); + + // Send request + let response = client.send_bytes(request).await.map_err(|e| { + HttpExportError { + status_code: 0, // Network error + retry_after: None, + message: format!("Network error: {e:?}"), + } + })?; + + let status_code = response.status().as_u16(); + let retry_after = response + .headers() + .get("retry-after") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + if !response.status().is_success() { + return Err(HttpExportError { + status_code, + retry_after, + message: format!( + "HTTP export failed. Url: {}, Status: {}, Response: {:?}", + request_uri, + status_code, + response.body() + ), + }); + } + + otel_debug!(name: "HttpLogsClient.ExportSucceeded"); + Ok(()) + }, + ) + .await + .map_err(|e| OTelSdkError::InternalFailure(e.message)) + } + + #[cfg(not(feature = "http-retry"))] async fn export(&self, batch: LogBatch<'_>) -> OTelSdkResult { let client = self .client diff --git a/opentelemetry-otlp/src/exporter/http/metrics.rs b/opentelemetry-otlp/src/exporter/http/metrics.rs index 11a3041585..b022754486 100644 --- a/opentelemetry-otlp/src/exporter/http/metrics.rs +++ b/opentelemetry-otlp/src/exporter/http/metrics.rs @@ -8,7 +8,121 @@ use opentelemetry_sdk::metrics::data::ResourceMetrics; use super::OtlpHttpClient; +#[cfg(feature = "http-retry")] +use super::{classify_http_export_error, HttpExportError, HttpRetryData}; +#[cfg(feature = "http-retry")] +use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; +#[cfg(feature = "http-retry")] +use opentelemetry_sdk::runtime::Tokio; + impl MetricsClient for OtlpHttpClient { + #[cfg(feature = "http-retry")] + async fn export(&self, metrics: &ResourceMetrics) -> OTelSdkResult { + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + // Build request body once before retry loop + let (body, content_type, content_encoding) = + self.build_metrics_export_body(metrics).ok_or_else(|| { + OTelSdkError::InternalFailure("Failed to serialize metrics".to_string()) + })?; + + let retry_data = Arc::new(HttpRetryData { + body, + headers: self.headers.clone(), + endpoint: self.collector_endpoint.to_string(), + }); + + retry_with_backoff( + Tokio, + policy, + classify_http_export_error, + "HttpMetricsClient.Export", + || async { + // Get client + let client = self + .client + .lock() + .map_err(|e| HttpExportError { + status_code: 500, + retry_after: None, + message: format!("Mutex lock failed: {e}"), + })? + .as_ref() + .ok_or_else(|| HttpExportError { + status_code: 500, + retry_after: None, + message: "Exporter already shutdown".to_string(), + })? 
+ .clone(); + + // Build HTTP request + let mut request_builder = http::Request::builder() + .method(Method::POST) + .uri(&retry_data.endpoint) + .header(CONTENT_TYPE, content_type); + + if let Some(encoding) = content_encoding { + request_builder = request_builder.header("Content-Encoding", encoding); + } + + let mut request = request_builder + .body(retry_data.body.clone().into()) + .map_err(|e| HttpExportError { + status_code: 400, + retry_after: None, + message: format!("Failed to build HTTP request: {e}"), + })?; + + for (k, v) in &retry_data.headers { + request.headers_mut().insert(k.clone(), v.clone()); + } + + let request_uri = request.uri().to_string(); + otel_debug!(name: "HttpMetricsClient.ExportStarted"); + + // Send request + let response = client.send_bytes(request).await.map_err(|e| { + HttpExportError { + status_code: 0, // Network error + retry_after: None, + message: format!("Network error: {e:?}"), + } + })?; + + let status_code = response.status().as_u16(); + let retry_after = response + .headers() + .get("retry-after") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + if !response.status().is_success() { + return Err(HttpExportError { + status_code, + retry_after, + message: format!( + "HTTP export failed. Url: {}, Status: {}, Response: {:?}", + request_uri, + status_code, + response.body() + ), + }); + } + + otel_debug!(name: "HttpMetricsClient.ExportSucceeded"); + Ok(()) + }, + ) + .await + .map_err(|e| OTelSdkError::InternalFailure(e.message)) + } + + #[cfg(not(feature = "http-retry"))] async fn export(&self, metrics: &ResourceMetrics) -> OTelSdkResult { let client = self .client diff --git a/opentelemetry-otlp/src/exporter/http/mod.rs b/opentelemetry-otlp/src/exporter/http/mod.rs index 5cb25ea37e..e51c950432 100644 --- a/opentelemetry-otlp/src/exporter/http/mod.rs +++ b/opentelemetry-otlp/src/exporter/http/mod.rs @@ -22,6 +22,37 @@ use std::str::FromStr; use std::sync::{Arc, Mutex}; use std::time::Duration; +#[cfg(feature = "http-retry")] +use crate::retry_classification::http::classify_http_error; +#[cfg(feature = "http-retry")] +use opentelemetry_sdk::retry::RetryErrorType; + +// Shared HTTP retry functionality +#[cfg(feature = "http-retry")] +/// HTTP-specific error wrapper for retry classification +#[derive(Debug)] +pub(crate) struct HttpExportError { + pub status_code: u16, + pub retry_after: Option, + pub message: String, +} + +#[cfg(feature = "http-retry")] +/// Classify HTTP export errors for retry decisions +pub(crate) fn classify_http_export_error(error: &HttpExportError) -> RetryErrorType { + classify_http_error(error.status_code, error.retry_after.as_deref()) +} + +#[cfg(feature = "http-retry")] +/// Shared HTTP request data for retry attempts - optimizes Arc usage by bundling all data +/// we need to pass into the retry handler +#[derive(Debug)] +pub(crate) struct HttpRetryData { + pub body: Vec, + pub headers: HashMap, + pub endpoint: String, +} + #[cfg(feature = "metrics")] mod metrics; diff --git a/opentelemetry-otlp/src/exporter/http/trace.rs b/opentelemetry-otlp/src/exporter/http/trace.rs index b94d74235d..c55d4f4609 100644 --- a/opentelemetry-otlp/src/exporter/http/trace.rs +++ b/opentelemetry-otlp/src/exporter/http/trace.rs @@ -9,27 +9,12 @@ use opentelemetry_sdk::{ }; #[cfg(feature = "http-retry")] -use crate::retry_classification::http::classify_http_error; +use super::{classify_http_export_error, HttpExportError, HttpRetryData}; #[cfg(feature = "http-retry")] -use opentelemetry_sdk::retry::{retry_with_backoff, RetryErrorType, 
RetryPolicy}; +use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; #[cfg(feature = "http-retry")] use opentelemetry_sdk::runtime::Tokio; -#[cfg(feature = "http-retry")] -/// HTTP-specific error wrapper for retry classification -#[derive(Debug)] -struct HttpExportError { - status_code: u16, - retry_after: Option, - message: String, -} - -#[cfg(feature = "http-retry")] -/// Classify HTTP export errors for retry decisions -fn classify_http_export_error(error: &HttpExportError) -> RetryErrorType { - classify_http_error(error.status_code, error.retry_after.as_deref()) -} - impl SpanExporter for OtlpHttpClient { #[cfg(feature = "http-retry")] async fn export(&self, batch: Vec) -> OTelSdkResult { @@ -40,7 +25,17 @@ impl SpanExporter for OtlpHttpClient { jitter_ms: 100, }; - let batch = Arc::new(batch); + // Build request body once before retry loop + let (body, content_type, content_encoding) = + self.build_trace_export_body(batch).map_err(|e| { + OTelSdkError::InternalFailure(format!("Failed to build request body: {e}")) + })?; + + let retry_data = Arc::new(HttpRetryData { + body, + headers: self.headers.clone(), + endpoint: self.collector_endpoint.to_string(), + }); retry_with_backoff( Tokio, @@ -48,8 +43,6 @@ impl SpanExporter for OtlpHttpClient { classify_http_export_error, "HttpTracesClient.Export", || async { - let batch_clone = Arc::clone(&batch); - // Get client let client = self .client @@ -67,35 +60,25 @@ impl SpanExporter for OtlpHttpClient { })? .clone(); - // Build request body - let (body, content_type, content_encoding) = self - .build_trace_export_body((*batch_clone).clone()) - .map_err(|e| HttpExportError { - status_code: 400, - retry_after: None, - message: format!("Failed to build request body: {e}"), - })?; - // Build HTTP request let mut request_builder = http::Request::builder() .method(Method::POST) - .uri(&self.collector_endpoint) + .uri(&retry_data.endpoint) .header(CONTENT_TYPE, content_type); if let Some(encoding) = content_encoding { request_builder = request_builder.header("Content-Encoding", encoding); } - let mut request = - request_builder - .body(body.into()) - .map_err(|e| HttpExportError { - status_code: 400, - retry_after: None, - message: format!("Failed to build HTTP request: {e}"), - })?; + let mut request = request_builder + .body(retry_data.body.clone().into()) + .map_err(|e| HttpExportError { + status_code: 400, + retry_after: None, + message: format!("Failed to build HTTP request: {e}"), + })?; - for (k, v) in &self.headers { + for (k, v) in &retry_data.headers { request.headers_mut().insert(k.clone(), v.clone()); } From c4cc2138ff5d221a66224ff8d24f2886c4b8cfd6 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 12 Aug 2025 13:39:07 +0200 Subject: [PATCH 16/27] changelog --- opentelemetry-otlp/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/opentelemetry-otlp/CHANGELOG.md b/opentelemetry-otlp/CHANGELOG.md index 45a2196ffc..1e5fb0ec75 100644 --- a/opentelemetry-otlp/CHANGELOG.md +++ b/opentelemetry-otlp/CHANGELOG.md @@ -8,6 +8,7 @@ Released 2025-Sep-25 - Update `opentelemetry-proto` and `opentelemetry-http` dependency version to 0.31.0 - Add HTTP compression support with `gzip-http` and `zstd-http` feature flags +- Add retry with exponential backoff and throttling support for HTTP and gRPC exporters ## 0.30.0 From 1b68212cb30293a5334fe61c57e866808c6b97ea Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 2 Sep 2025 10:59:45 +0200 Subject: [PATCH 17/27] chore: wrap headers in arc to save cloning per batch --- 
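Commentary (not part of the patch): the motivation for this change is that the shared header map no longer has to be deep-copied for every export or retry attempt; each attempt just clones the Arc handle. A minimal sketch of the idea using std types only — the exporter's real map uses http::HeaderName/HeaderValue, and String keys here are purely illustrative:

use std::collections::HashMap;
use std::sync::Arc;

fn main() {
    // Build the header map once, as OtlpHttpClient::new now does.
    let headers: Arc<HashMap<String, String>> = Arc::new(HashMap::from([(
        "authorization".to_string(),
        "Bearer token".to_string(),
    )]));

    // Each export / retry attempt clones only the Arc (a refcount bump),
    // not the underlying map of headers.
    let per_attempt = Arc::clone(&headers);
    assert_eq!(per_attempt.len(), headers.len());
}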
opentelemetry-otlp/src/exporter/http/logs.rs | 4 ++-- opentelemetry-otlp/src/exporter/http/metrics.rs | 4 ++-- opentelemetry-otlp/src/exporter/http/mod.rs | 6 +++--- opentelemetry-otlp/src/exporter/http/trace.rs | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/http/logs.rs b/opentelemetry-otlp/src/exporter/http/logs.rs index cc61a10474..c5c63299c5 100644 --- a/opentelemetry-otlp/src/exporter/http/logs.rs +++ b/opentelemetry-otlp/src/exporter/http/logs.rs @@ -77,7 +77,7 @@ impl LogExporter for OtlpHttpClient { message: format!("Failed to build HTTP request: {e}"), })?; - for (k, v) in &retry_data.headers { + for (k, v) in retry_data.headers.iter() { request.headers_mut().insert(k.clone(), v.clone()); } @@ -147,7 +147,7 @@ impl LogExporter for OtlpHttpClient { .body(body.into()) .map_err(|e| OTelSdkError::InternalFailure(e.to_string()))?; - for (k, v) in &self.headers { + for (k, v) in self.headers.iter() { request.headers_mut().insert(k.clone(), v.clone()); } diff --git a/opentelemetry-otlp/src/exporter/http/metrics.rs b/opentelemetry-otlp/src/exporter/http/metrics.rs index b022754486..f3598f4f52 100644 --- a/opentelemetry-otlp/src/exporter/http/metrics.rs +++ b/opentelemetry-otlp/src/exporter/http/metrics.rs @@ -78,7 +78,7 @@ impl MetricsClient for OtlpHttpClient { message: format!("Failed to build HTTP request: {e}"), })?; - for (k, v) in &retry_data.headers { + for (k, v) in retry_data.headers.iter() { request.headers_mut().insert(k.clone(), v.clone()); } @@ -151,7 +151,7 @@ impl MetricsClient for OtlpHttpClient { .body(body.into()) .map_err(|e| OTelSdkError::InternalFailure(format!("{e:?}")))?; - for (k, v) in &self.headers { + for (k, v) in self.headers.iter() { request.headers_mut().insert(k.clone(), v.clone()); } diff --git a/opentelemetry-otlp/src/exporter/http/mod.rs b/opentelemetry-otlp/src/exporter/http/mod.rs index e51c950432..a21a04570b 100644 --- a/opentelemetry-otlp/src/exporter/http/mod.rs +++ b/opentelemetry-otlp/src/exporter/http/mod.rs @@ -49,7 +49,7 @@ pub(crate) fn classify_http_export_error(error: &HttpExportError) -> RetryErrorT #[derive(Debug)] pub(crate) struct HttpRetryData { pub body: Vec, - pub headers: HashMap, + pub headers: Arc>, pub endpoint: String, } @@ -329,7 +329,7 @@ impl HttpExporterBuilder { pub(crate) struct OtlpHttpClient { client: Mutex>>, collector_endpoint: Uri, - headers: HashMap, + headers: Arc>, protocol: Protocol, _timeout: Duration, compression: Option, @@ -383,7 +383,7 @@ impl OtlpHttpClient { OtlpHttpClient { client: Mutex::new(Some(client)), collector_endpoint, - headers, + headers: Arc::new(headers), protocol, _timeout: timeout, compression, diff --git a/opentelemetry-otlp/src/exporter/http/trace.rs b/opentelemetry-otlp/src/exporter/http/trace.rs index c55d4f4609..feaada955b 100644 --- a/opentelemetry-otlp/src/exporter/http/trace.rs +++ b/opentelemetry-otlp/src/exporter/http/trace.rs @@ -78,7 +78,7 @@ impl SpanExporter for OtlpHttpClient { message: format!("Failed to build HTTP request: {e}"), })?; - for (k, v) in &retry_data.headers { + for (k, v) in retry_data.headers.iter() { request.headers_mut().insert(k.clone(), v.clone()); } @@ -155,7 +155,7 @@ impl SpanExporter for OtlpHttpClient { Err(e) => return Err(OTelSdkError::InternalFailure(e.to_string())), }; - for (k, v) in &self.headers { + for (k, v) in self.headers.iter() { request.headers_mut().insert(k.clone(), v.clone()); } From 2d97decf1c9b1e07eadce7a825362ba204356ebf Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 2 
Sep 2025 12:09:27 +0200 Subject: [PATCH 18/27] chore: simplify HttpExportError creation --- opentelemetry-otlp/src/exporter/http/logs.rs | 42 +++++++------------ .../src/exporter/http/metrics.rs | 42 +++++++------------ opentelemetry-otlp/src/exporter/http/mod.rs | 21 ++++++++++ opentelemetry-otlp/src/exporter/http/trace.rs | 42 +++++++------------ 4 files changed, 69 insertions(+), 78 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/http/logs.rs b/opentelemetry-otlp/src/exporter/http/logs.rs index c5c63299c5..a1e584337e 100644 --- a/opentelemetry-otlp/src/exporter/http/logs.rs +++ b/opentelemetry-otlp/src/exporter/http/logs.rs @@ -46,16 +46,10 @@ impl LogExporter for OtlpHttpClient { let client = self .client .lock() - .map_err(|e| HttpExportError { - status_code: 500, - retry_after: None, - message: format!("Mutex lock failed: {e}"), - })? + .map_err(|e| HttpExportError::new(500, format!("Mutex lock failed: {e}")))? .as_ref() - .ok_or_else(|| HttpExportError { - status_code: 500, - retry_after: None, - message: "Exporter already shutdown".to_string(), + .ok_or_else(|| { + HttpExportError::new(500, "Exporter already shutdown".to_string()) })? .clone(); @@ -71,10 +65,8 @@ impl LogExporter for OtlpHttpClient { let mut request = request_builder .body(retry_data.body.clone().into()) - .map_err(|e| HttpExportError { - status_code: 400, - retry_after: None, - message: format!("Failed to build HTTP request: {e}"), + .map_err(|e| { + HttpExportError::new(400, format!("Failed to build HTTP request: {e}")) })?; for (k, v) in retry_data.headers.iter() { @@ -86,11 +78,7 @@ impl LogExporter for OtlpHttpClient { // Send request let response = client.send_bytes(request).await.map_err(|e| { - HttpExportError { - status_code: 0, // Network error - retry_after: None, - message: format!("Network error: {e:?}"), - } + HttpExportError::new(0, format!("Network error: {e:?}")) // Network error })?; let status_code = response.status().as_u16(); @@ -101,15 +89,17 @@ impl LogExporter for OtlpHttpClient { .map(|s| s.to_string()); if !response.status().is_success() { - return Err(HttpExportError { + let message = format!( + "HTTP export failed. Url: {}, Status: {}, Response: {:?}", + request_uri, status_code, - retry_after, - message: format!( - "HTTP export failed. Url: {}, Status: {}, Response: {:?}", - request_uri, - status_code, - response.body() - ), + response.body() + ); + return Err(match retry_after { + Some(retry_after) => { + HttpExportError::with_retry_after(status_code, retry_after, message) + } + None => HttpExportError::new(status_code, message), }); } diff --git a/opentelemetry-otlp/src/exporter/http/metrics.rs b/opentelemetry-otlp/src/exporter/http/metrics.rs index f3598f4f52..0f517b3839 100644 --- a/opentelemetry-otlp/src/exporter/http/metrics.rs +++ b/opentelemetry-otlp/src/exporter/http/metrics.rs @@ -47,16 +47,10 @@ impl MetricsClient for OtlpHttpClient { let client = self .client .lock() - .map_err(|e| HttpExportError { - status_code: 500, - retry_after: None, - message: format!("Mutex lock failed: {e}"), - })? + .map_err(|e| HttpExportError::new(500, format!("Mutex lock failed: {e}")))? .as_ref() - .ok_or_else(|| HttpExportError { - status_code: 500, - retry_after: None, - message: "Exporter already shutdown".to_string(), + .ok_or_else(|| { + HttpExportError::new(500, "Exporter already shutdown".to_string()) })? 
.clone(); @@ -72,10 +66,8 @@ impl MetricsClient for OtlpHttpClient { let mut request = request_builder .body(retry_data.body.clone().into()) - .map_err(|e| HttpExportError { - status_code: 400, - retry_after: None, - message: format!("Failed to build HTTP request: {e}"), + .map_err(|e| { + HttpExportError::new(400, format!("Failed to build HTTP request: {e}")) })?; for (k, v) in retry_data.headers.iter() { @@ -87,11 +79,7 @@ impl MetricsClient for OtlpHttpClient { // Send request let response = client.send_bytes(request).await.map_err(|e| { - HttpExportError { - status_code: 0, // Network error - retry_after: None, - message: format!("Network error: {e:?}"), - } + HttpExportError::new(0, format!("Network error: {e:?}")) // Network error })?; let status_code = response.status().as_u16(); @@ -102,15 +90,17 @@ impl MetricsClient for OtlpHttpClient { .map(|s| s.to_string()); if !response.status().is_success() { - return Err(HttpExportError { + let message = format!( + "HTTP export failed. Url: {}, Status: {}, Response: {:?}", + request_uri, status_code, - retry_after, - message: format!( - "HTTP export failed. Url: {}, Status: {}, Response: {:?}", - request_uri, - status_code, - response.body() - ), + response.body() + ); + return Err(match retry_after { + Some(retry_after) => { + HttpExportError::with_retry_after(status_code, retry_after, message) + } + None => HttpExportError::new(status_code, message), }); } diff --git a/opentelemetry-otlp/src/exporter/http/mod.rs b/opentelemetry-otlp/src/exporter/http/mod.rs index a21a04570b..1f7f3641cf 100644 --- a/opentelemetry-otlp/src/exporter/http/mod.rs +++ b/opentelemetry-otlp/src/exporter/http/mod.rs @@ -37,6 +37,27 @@ pub(crate) struct HttpExportError { pub message: String, } +#[cfg(feature = "http-retry")] +impl HttpExportError { + /// Create a new HttpExportError without retry-after header + pub(crate) fn new(status_code: u16, message: String) -> Self { + Self { + status_code, + retry_after: None, + message, + } + } + + /// Create a new HttpExportError with retry-after header + pub(crate) fn with_retry_after(status_code: u16, retry_after: String, message: String) -> Self { + Self { + status_code, + retry_after: Some(retry_after), + message, + } + } +} + #[cfg(feature = "http-retry")] /// Classify HTTP export errors for retry decisions pub(crate) fn classify_http_export_error(error: &HttpExportError) -> RetryErrorType { diff --git a/opentelemetry-otlp/src/exporter/http/trace.rs b/opentelemetry-otlp/src/exporter/http/trace.rs index feaada955b..073434f68a 100644 --- a/opentelemetry-otlp/src/exporter/http/trace.rs +++ b/opentelemetry-otlp/src/exporter/http/trace.rs @@ -47,16 +47,10 @@ impl SpanExporter for OtlpHttpClient { let client = self .client .lock() - .map_err(|e| HttpExportError { - status_code: 500, - retry_after: None, - message: format!("Mutex lock failed: {e}"), - })? + .map_err(|e| HttpExportError::new(500, format!("Mutex lock failed: {e}")))? .as_ref() - .ok_or_else(|| HttpExportError { - status_code: 500, - retry_after: None, - message: "Exporter already shutdown".to_string(), + .ok_or_else(|| { + HttpExportError::new(500, "Exporter already shutdown".to_string()) })? 
.clone(); @@ -72,10 +66,8 @@ impl SpanExporter for OtlpHttpClient { let mut request = request_builder .body(retry_data.body.clone().into()) - .map_err(|e| HttpExportError { - status_code: 400, - retry_after: None, - message: format!("Failed to build HTTP request: {e}"), + .map_err(|e| { + HttpExportError::new(400, format!("Failed to build HTTP request: {e}")) })?; for (k, v) in retry_data.headers.iter() { @@ -87,11 +79,7 @@ impl SpanExporter for OtlpHttpClient { // Send request let response = client.send_bytes(request).await.map_err(|e| { - HttpExportError { - status_code: 0, // Network error - retry_after: None, - message: format!("Network error: {e:?}"), - } + HttpExportError::new(0, format!("Network error: {e:?}")) // Network error })?; let status_code = response.status().as_u16(); @@ -102,15 +90,17 @@ impl SpanExporter for OtlpHttpClient { .map(|s| s.to_string()); if !response.status().is_success() { - return Err(HttpExportError { + let message = format!( + "HTTP export failed. Url: {}, Status: {}, Response: {:?}", + request_uri, status_code, - retry_after, - message: format!( - "HTTP export failed. Url: {}, Status: {}, Response: {:?}", - request_uri, - status_code, - response.body() - ), + response.body() + ); + return Err(match retry_after { + Some(retry_after) => { + HttpExportError::with_retry_after(status_code, retry_after, message) + } + None => HttpExportError::new(status_code, message), }); } From 579efce19de71fe45c59009155f85bbc503a79e1 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Tue, 2 Sep 2025 12:40:46 +0200 Subject: [PATCH 19/27] chore: factor retry logic out --- opentelemetry-otlp/src/exporter/http/logs.rs | 156 +--------------- .../src/exporter/http/metrics.rs | 167 +----------------- opentelemetry-otlp/src/exporter/http/mod.rs | 160 ++++++++++++++++- opentelemetry-otlp/src/exporter/http/trace.rs | 163 +---------------- 4 files changed, 164 insertions(+), 482 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/http/logs.rs b/opentelemetry-otlp/src/exporter/http/logs.rs index a1e584337e..8690250f81 100644 --- a/opentelemetry-otlp/src/exporter/http/logs.rs +++ b/opentelemetry-otlp/src/exporter/http/logs.rs @@ -1,166 +1,16 @@ use super::OtlpHttpClient; -use http::header::CONTENT_ENCODING; -use http::{header::CONTENT_TYPE, Method}; -use opentelemetry::otel_debug; use opentelemetry_sdk::error::{OTelSdkError, OTelSdkResult}; use opentelemetry_sdk::logs::{LogBatch, LogExporter}; -#[cfg(feature = "http-retry")] -use std::sync::Arc; use std::time; -#[cfg(feature = "http-retry")] -use super::{classify_http_export_error, HttpExportError, HttpRetryData}; -#[cfg(feature = "http-retry")] -use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; -#[cfg(feature = "http-retry")] -use opentelemetry_sdk::runtime::Tokio; - impl LogExporter for OtlpHttpClient { - #[cfg(feature = "http-retry")] async fn export(&self, batch: LogBatch<'_>) -> OTelSdkResult { - let policy = RetryPolicy { - max_retries: 3, - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, - }; - - // Build request body once before retry loop since LogBatch contains borrowed data - let (body, content_type, content_encoding) = self - .build_logs_export_body(batch) - .map_err(OTelSdkError::InternalFailure)?; - - let retry_data = Arc::new(HttpRetryData { - body, - headers: self.headers.clone(), - endpoint: self.collector_endpoint.to_string(), - }); - - retry_with_backoff( - Tokio, - policy, - classify_http_export_error, + self.export_http_with_retry( + batch, + 
OtlpHttpClient::build_logs_export_body, "HttpLogsClient.Export", - || async { - // Get client - let client = self - .client - .lock() - .map_err(|e| HttpExportError::new(500, format!("Mutex lock failed: {e}")))? - .as_ref() - .ok_or_else(|| { - HttpExportError::new(500, "Exporter already shutdown".to_string()) - })? - .clone(); - - // Build HTTP request - let mut request_builder = http::Request::builder() - .method(Method::POST) - .uri(&retry_data.endpoint) - .header(CONTENT_TYPE, content_type); - - if let Some(encoding) = content_encoding { - request_builder = request_builder.header(CONTENT_ENCODING, encoding); - } - - let mut request = request_builder - .body(retry_data.body.clone().into()) - .map_err(|e| { - HttpExportError::new(400, format!("Failed to build HTTP request: {e}")) - })?; - - for (k, v) in retry_data.headers.iter() { - request.headers_mut().insert(k.clone(), v.clone()); - } - - let request_uri = request.uri().to_string(); - otel_debug!(name: "HttpLogsClient.ExportStarted"); - - // Send request - let response = client.send_bytes(request).await.map_err(|e| { - HttpExportError::new(0, format!("Network error: {e:?}")) // Network error - })?; - - let status_code = response.status().as_u16(); - let retry_after = response - .headers() - .get("retry-after") - .and_then(|v| v.to_str().ok()) - .map(|s| s.to_string()); - - if !response.status().is_success() { - let message = format!( - "HTTP export failed. Url: {}, Status: {}, Response: {:?}", - request_uri, - status_code, - response.body() - ); - return Err(match retry_after { - Some(retry_after) => { - HttpExportError::with_retry_after(status_code, retry_after, message) - } - None => HttpExportError::new(status_code, message), - }); - } - - otel_debug!(name: "HttpLogsClient.ExportSucceeded"); - Ok(()) - }, ) .await - .map_err(|e| OTelSdkError::InternalFailure(e.message)) - } - - #[cfg(not(feature = "http-retry"))] - async fn export(&self, batch: LogBatch<'_>) -> OTelSdkResult { - let client = self - .client - .lock() - .map_err(|e| OTelSdkError::InternalFailure(format!("Mutex lock failed: {e}")))? - .clone() - .ok_or(OTelSdkError::AlreadyShutdown)?; - - let (body, content_type, content_encoding) = self - .build_logs_export_body(batch) - .map_err(OTelSdkError::InternalFailure)?; - - let mut request_builder = http::Request::builder() - .method(Method::POST) - .uri(&self.collector_endpoint) - .header(CONTENT_TYPE, content_type); - - if let Some(encoding) = content_encoding { - request_builder = request_builder.header(CONTENT_ENCODING, encoding); - } - - let mut request = request_builder - .body(body.into()) - .map_err(|e| OTelSdkError::InternalFailure(e.to_string()))?; - - for (k, v) in self.headers.iter() { - request.headers_mut().insert(k.clone(), v.clone()); - } - - let request_uri = request.uri().to_string(); - otel_debug!(name: "HttpLogsClient.ExportStarted"); - let response = client - .send_bytes(request) - .await - .map_err(|e| OTelSdkError::InternalFailure(format!("{e:?}")))?; - - if !response.status().is_success() { - let error = format!( - "OpenTelemetry logs export failed. 
Url: {}, Status Code: {}, Response: {:?}", - request_uri, - response.status().as_u16(), - response.body() - ); - otel_debug!(name: "HttpLogsClient.ExportFailed", error = &error); - return Err(OTelSdkError::InternalFailure(error)); - } - - otel_debug!(name: "HttpLogsClient.ExportSucceeded"); - Ok(()) } fn shutdown_with_timeout(&self, _timeout: time::Duration) -> OTelSdkResult { diff --git a/opentelemetry-otlp/src/exporter/http/metrics.rs b/opentelemetry-otlp/src/exporter/http/metrics.rs index 0f517b3839..6acee3821b 100644 --- a/opentelemetry-otlp/src/exporter/http/metrics.rs +++ b/opentelemetry-otlp/src/exporter/http/metrics.rs @@ -1,174 +1,19 @@ -use std::sync::Arc; - use crate::metric::MetricsClient; -use http::{header::CONTENT_TYPE, Method}; -use opentelemetry::otel_debug; use opentelemetry_sdk::error::{OTelSdkError, OTelSdkResult}; use opentelemetry_sdk::metrics::data::ResourceMetrics; use super::OtlpHttpClient; -#[cfg(feature = "http-retry")] -use super::{classify_http_export_error, HttpExportError, HttpRetryData}; -#[cfg(feature = "http-retry")] -use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; -#[cfg(feature = "http-retry")] -use opentelemetry_sdk::runtime::Tokio; - impl MetricsClient for OtlpHttpClient { - #[cfg(feature = "http-retry")] async fn export(&self, metrics: &ResourceMetrics) -> OTelSdkResult { - let policy = RetryPolicy { - max_retries: 3, - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, + let build_body_wrapper = |client: &OtlpHttpClient, metrics: &ResourceMetrics| { + client + .build_metrics_export_body(metrics) + .ok_or_else(|| "Failed to serialize metrics".to_string()) }; - // Build request body once before retry loop - let (body, content_type, content_encoding) = - self.build_metrics_export_body(metrics).ok_or_else(|| { - OTelSdkError::InternalFailure("Failed to serialize metrics".to_string()) - })?; - - let retry_data = Arc::new(HttpRetryData { - body, - headers: self.headers.clone(), - endpoint: self.collector_endpoint.to_string(), - }); - - retry_with_backoff( - Tokio, - policy, - classify_http_export_error, - "HttpMetricsClient.Export", - || async { - // Get client - let client = self - .client - .lock() - .map_err(|e| HttpExportError::new(500, format!("Mutex lock failed: {e}")))? - .as_ref() - .ok_or_else(|| { - HttpExportError::new(500, "Exporter already shutdown".to_string()) - })? - .clone(); - - // Build HTTP request - let mut request_builder = http::Request::builder() - .method(Method::POST) - .uri(&retry_data.endpoint) - .header(CONTENT_TYPE, content_type); - - if let Some(encoding) = content_encoding { - request_builder = request_builder.header("Content-Encoding", encoding); - } - - let mut request = request_builder - .body(retry_data.body.clone().into()) - .map_err(|e| { - HttpExportError::new(400, format!("Failed to build HTTP request: {e}")) - })?; - - for (k, v) in retry_data.headers.iter() { - request.headers_mut().insert(k.clone(), v.clone()); - } - - let request_uri = request.uri().to_string(); - otel_debug!(name: "HttpMetricsClient.ExportStarted"); - - // Send request - let response = client.send_bytes(request).await.map_err(|e| { - HttpExportError::new(0, format!("Network error: {e:?}")) // Network error - })?; - - let status_code = response.status().as_u16(); - let retry_after = response - .headers() - .get("retry-after") - .and_then(|v| v.to_str().ok()) - .map(|s| s.to_string()); - - if !response.status().is_success() { - let message = format!( - "HTTP export failed. 
Url: {}, Status: {}, Response: {:?}", - request_uri, - status_code, - response.body() - ); - return Err(match retry_after { - Some(retry_after) => { - HttpExportError::with_retry_after(status_code, retry_after, message) - } - None => HttpExportError::new(status_code, message), - }); - } - - otel_debug!(name: "HttpMetricsClient.ExportSucceeded"); - Ok(()) - }, - ) - .await - .map_err(|e| OTelSdkError::InternalFailure(e.message)) - } - - #[cfg(not(feature = "http-retry"))] - async fn export(&self, metrics: &ResourceMetrics) -> OTelSdkResult { - let client = self - .client - .lock() - .map_err(|e| OTelSdkError::InternalFailure(format!("Failed to acquire lock: {e:?}"))) - .and_then(|g| match &*g { - Some(client) => Ok(Arc::clone(client)), - _ => Err(OTelSdkError::AlreadyShutdown), - })?; - - let (body, content_type, content_encoding) = - self.build_metrics_export_body(metrics).ok_or_else(|| { - OTelSdkError::InternalFailure("Failed to serialize metrics".to_string()) - })?; - - let mut request_builder = http::Request::builder() - .method(Method::POST) - .uri(&self.collector_endpoint) - .header(CONTENT_TYPE, content_type); - - if let Some(encoding) = content_encoding { - request_builder = request_builder.header("Content-Encoding", encoding); - } - - let mut request = request_builder - .body(body.into()) - .map_err(|e| OTelSdkError::InternalFailure(format!("{e:?}")))?; - - for (k, v) in self.headers.iter() { - request.headers_mut().insert(k.clone(), v.clone()); - } - - otel_debug!(name: "HttpMetricsClient.ExportStarted"); - let result = client.send_bytes(request).await; - - match result { - Ok(response) => { - if response.status().is_success() { - otel_debug!(name: "HttpMetricsClient.ExportSucceeded"); - Ok(()) - } else { - let error = format!( - "OpenTelemetry metrics export failed. 
Status Code: {}, Response: {:?}", - response.status().as_u16(), - response.body() - ); - otel_debug!(name: "HttpMetricsClient.ExportFailed", error = &error); - Err(OTelSdkError::InternalFailure(error)) - } - } - Err(e) => { - let error = format!("{e:?}"); - otel_debug!(name: "HttpMetricsClient.ExportFailed", error = &error); - Err(OTelSdkError::InternalFailure(error)) - } - } + self.export_http_with_retry(metrics, build_body_wrapper, "HttpMetricsClient.Export") + .await } fn shutdown(&self) -> OTelSdkResult { diff --git a/opentelemetry-otlp/src/exporter/http/mod.rs b/opentelemetry-otlp/src/exporter/http/mod.rs index 1f7f3641cf..d04c8badc7 100644 --- a/opentelemetry-otlp/src/exporter/http/mod.rs +++ b/opentelemetry-otlp/src/exporter/http/mod.rs @@ -28,31 +28,39 @@ use crate::retry_classification::http::classify_http_error; use opentelemetry_sdk::retry::RetryErrorType; // Shared HTTP retry functionality -#[cfg(feature = "http-retry")] /// HTTP-specific error wrapper for retry classification #[derive(Debug)] pub(crate) struct HttpExportError { + #[cfg(feature = "http-retry")] pub status_code: u16, + #[cfg(feature = "http-retry")] pub retry_after: Option, pub message: String, } -#[cfg(feature = "http-retry")] impl HttpExportError { /// Create a new HttpExportError without retry-after header - pub(crate) fn new(status_code: u16, message: String) -> Self { + pub(crate) fn new(_status_code: u16, message: String) -> Self { Self { - status_code, + #[cfg(feature = "http-retry")] + status_code: _status_code, + #[cfg(feature = "http-retry")] retry_after: None, message, } } /// Create a new HttpExportError with retry-after header - pub(crate) fn with_retry_after(status_code: u16, retry_after: String, message: String) -> Self { + pub(crate) fn with_retry_after( + _status_code: u16, + _retry_after: String, + message: String, + ) -> Self { Self { - status_code, - retry_after: Some(retry_after), + #[cfg(feature = "http-retry")] + status_code: _status_code, + #[cfg(feature = "http-retry")] + retry_after: Some(_retry_after), message, } } @@ -64,7 +72,6 @@ pub(crate) fn classify_http_export_error(error: &HttpExportError) -> RetryErrorT classify_http_error(error.status_code, error.retry_after.as_deref()) } -#[cfg(feature = "http-retry")] /// Shared HTTP request data for retry attempts - optimizes Arc usage by bundling all data /// we need to pass into the retry handler #[derive(Debug)] @@ -360,6 +367,143 @@ pub(crate) struct OtlpHttpClient { } impl OtlpHttpClient { + /// Shared HTTP export logic used by all exporters with retry support + async fn export_http_with_retry( + &self, + data: T, + build_body_fn: F, + operation_name: &'static str, + ) -> opentelemetry_sdk::error::OTelSdkResult + where + F: Fn(&Self, T) -> Result<(Vec, &'static str, Option<&'static str>), String>, + { + #[cfg(feature = "http-retry")] + { + use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; + use opentelemetry_sdk::runtime::Tokio; + + let policy = RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }; + + // Build request body once before retry loop + let (body, content_type, content_encoding) = build_body_fn(self, data) + .map_err(opentelemetry_sdk::error::OTelSdkError::InternalFailure)?; + + let retry_data = Arc::new(HttpRetryData { + body, + headers: self.headers.clone(), + endpoint: self.collector_endpoint.to_string(), + }); + + retry_with_backoff( + Tokio, + policy, + classify_http_export_error, + operation_name, + || async { + self.export_http_once( + &retry_data, + 
content_type, + content_encoding, + operation_name, + ) + .await + }, + ) + .await + .map_err(|e| opentelemetry_sdk::error::OTelSdkError::InternalFailure(e.message)) + } + + #[cfg(not(feature = "http-retry"))] + { + let (body, content_type, content_encoding) = build_body_fn(self, data) + .map_err(opentelemetry_sdk::error::OTelSdkError::InternalFailure)?; + + let retry_data = HttpRetryData { + body, + headers: self.headers.clone(), + endpoint: self.collector_endpoint.to_string(), + }; + + self.export_http_once(&retry_data, content_type, content_encoding, operation_name) + .await + .map_err(|e| opentelemetry_sdk::error::OTelSdkError::InternalFailure(e.message)) + } + } + + /// Single HTTP export attempt - shared between retry and no-retry paths + async fn export_http_once( + &self, + retry_data: &HttpRetryData, + content_type: &'static str, + content_encoding: Option<&'static str>, + _operation_name: &'static str, + ) -> Result<(), HttpExportError> { + // Get client + let client = self + .client + .lock() + .map_err(|e| HttpExportError::new(500, format!("Mutex lock failed: {e}")))? + .as_ref() + .ok_or_else(|| HttpExportError::new(500, "Exporter already shutdown".to_string()))? + .clone(); + + // Build HTTP request + let mut request_builder = http::Request::builder() + .method(http::Method::POST) + .uri(&retry_data.endpoint) + .header(http::header::CONTENT_TYPE, content_type); + + if let Some(encoding) = content_encoding { + request_builder = request_builder.header("Content-Encoding", encoding); + } + + let mut request = request_builder + .body(retry_data.body.clone().into()) + .map_err(|e| HttpExportError::new(400, format!("Failed to build HTTP request: {e}")))?; + + for (k, v) in retry_data.headers.iter() { + request.headers_mut().insert(k.clone(), v.clone()); + } + + let request_uri = request.uri().to_string(); + otel_debug!(name: "HttpClient.ExportStarted"); + + // Send request + let response = client.send_bytes(request).await.map_err(|e| { + HttpExportError::new(0, format!("Network error: {e:?}")) // Network error + })?; + + let status_code = response.status().as_u16(); + let retry_after = response + .headers() + .get("retry-after") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + if !response.status().is_success() { + let message = format!( + "HTTP export failed. Url: {}, Status: {}, Response: {:?}", + request_uri, + status_code, + response.body() + ); + return Err(match retry_after { + Some(retry_after) => { + HttpExportError::with_retry_after(status_code, retry_after, message) + } + None => HttpExportError::new(status_code, message), + }); + } + + otel_debug!(name: "HttpClient.ExportSucceeded"); + Ok(()) + } + /// Compress data using gzip or zstd if the user has requested it and the relevant feature /// has been enabled. If the user has requested it but the feature has not been enabled, /// we should catch this at exporter build time and never get here. 
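Commentary (not part of the patch): the shared path above funnels every failure into HttpExportError, and classify_http_export_error hands the status code plus any Retry-After value to retry_classification::http::classify_http_error, which decides whether the retry loop backs off, waits for a server-supplied delay, or gives up. A rough, self-contained sketch of that flow follows; the stand-in enum and the exact status-code policy are assumptions for illustration only — the real classifier lives in retry_classification.rs and RetryErrorType in opentelemetry_sdk::retry.

use std::time::Duration;

// Stand-in for opentelemetry_sdk::retry::RetryErrorType (variant names assumed).
#[derive(Debug, PartialEq)]
enum RetryErrorType {
    NonRetryable,
    Retryable,
    Throttled(Duration),
}

// Illustrative classifier: numeric Retry-After seconds are honoured (capped at
// 10 minutes, as parse_retry_after does); otherwise 429/5xx and the status 0
// network-error sentinel used above get a plain exponential-backoff retry.
fn classify(status: u16, retry_after: Option<&str>) -> RetryErrorType {
    let throttle = retry_after
        .and_then(|s| s.trim().parse::<u64>().ok())
        .map(|secs| Duration::from_secs(secs.min(600)));
    match (status, throttle) {
        (429, Some(d)) | (503, Some(d)) => RetryErrorType::Throttled(d),
        (429 | 500..=599 | 0, _) => RetryErrorType::Retryable,
        _ => RetryErrorType::NonRetryable,
    }
}

fn main() {
    assert_eq!(
        classify(429, Some("30")),
        RetryErrorType::Throttled(Duration::from_secs(30))
    );
    assert_eq!(classify(502, None), RetryErrorType::Retryable);
    assert_eq!(classify(400, None), RetryErrorType::NonRetryable);
}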
diff --git a/opentelemetry-otlp/src/exporter/http/trace.rs b/opentelemetry-otlp/src/exporter/http/trace.rs index 073434f68a..c77e59efde 100644 --- a/opentelemetry-otlp/src/exporter/http/trace.rs +++ b/opentelemetry-otlp/src/exporter/http/trace.rs @@ -1,174 +1,17 @@ -use std::sync::Arc; - use super::OtlpHttpClient; -use http::{header::CONTENT_TYPE, Method}; -use opentelemetry::otel_debug; use opentelemetry_sdk::{ error::{OTelSdkError, OTelSdkResult}, trace::{SpanData, SpanExporter}, }; -#[cfg(feature = "http-retry")] -use super::{classify_http_export_error, HttpExportError, HttpRetryData}; -#[cfg(feature = "http-retry")] -use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; -#[cfg(feature = "http-retry")] -use opentelemetry_sdk::runtime::Tokio; - impl SpanExporter for OtlpHttpClient { - #[cfg(feature = "http-retry")] async fn export(&self, batch: Vec) -> OTelSdkResult { - let policy = RetryPolicy { - max_retries: 3, - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, - }; - - // Build request body once before retry loop - let (body, content_type, content_encoding) = - self.build_trace_export_body(batch).map_err(|e| { - OTelSdkError::InternalFailure(format!("Failed to build request body: {e}")) - })?; - - let retry_data = Arc::new(HttpRetryData { - body, - headers: self.headers.clone(), - endpoint: self.collector_endpoint.to_string(), - }); - - retry_with_backoff( - Tokio, - policy, - classify_http_export_error, + self.export_http_with_retry( + batch, + OtlpHttpClient::build_trace_export_body, "HttpTracesClient.Export", - || async { - // Get client - let client = self - .client - .lock() - .map_err(|e| HttpExportError::new(500, format!("Mutex lock failed: {e}")))? - .as_ref() - .ok_or_else(|| { - HttpExportError::new(500, "Exporter already shutdown".to_string()) - })? - .clone(); - - // Build HTTP request - let mut request_builder = http::Request::builder() - .method(Method::POST) - .uri(&retry_data.endpoint) - .header(CONTENT_TYPE, content_type); - - if let Some(encoding) = content_encoding { - request_builder = request_builder.header("Content-Encoding", encoding); - } - - let mut request = request_builder - .body(retry_data.body.clone().into()) - .map_err(|e| { - HttpExportError::new(400, format!("Failed to build HTTP request: {e}")) - })?; - - for (k, v) in retry_data.headers.iter() { - request.headers_mut().insert(k.clone(), v.clone()); - } - - let request_uri = request.uri().to_string(); - otel_debug!(name: "HttpTracesClient.ExportStarted"); - - // Send request - let response = client.send_bytes(request).await.map_err(|e| { - HttpExportError::new(0, format!("Network error: {e:?}")) // Network error - })?; - - let status_code = response.status().as_u16(); - let retry_after = response - .headers() - .get("retry-after") - .and_then(|v| v.to_str().ok()) - .map(|s| s.to_string()); - - if !response.status().is_success() { - let message = format!( - "HTTP export failed. 
Url: {}, Status: {}, Response: {:?}", - request_uri, - status_code, - response.body() - ); - return Err(match retry_after { - Some(retry_after) => { - HttpExportError::with_retry_after(status_code, retry_after, message) - } - None => HttpExportError::new(status_code, message), - }); - } - - otel_debug!(name: "HttpTracesClient.ExportSucceeded"); - Ok(()) - }, ) .await - .map_err(|e| OTelSdkError::InternalFailure(e.message)) - } - - #[cfg(not(feature = "http-retry"))] - async fn export(&self, batch: Vec) -> OTelSdkResult { - let client = match self - .client - .lock() - .map_err(|e| OTelSdkError::InternalFailure(format!("Mutex lock failed: {e}"))) - .and_then(|g| match &*g { - Some(client) => Ok(Arc::clone(client)), - _ => Err(OTelSdkError::AlreadyShutdown), - }) { - Ok(client) => client, - Err(err) => return Err(err), - }; - - let (body, content_type, content_encoding) = match self.build_trace_export_body(batch) { - Ok(result) => result, - Err(e) => return Err(OTelSdkError::InternalFailure(e.to_string())), - }; - - let mut request_builder = http::Request::builder() - .method(Method::POST) - .uri(&self.collector_endpoint) - .header(CONTENT_TYPE, content_type); - - if let Some(encoding) = content_encoding { - request_builder = request_builder.header("Content-Encoding", encoding); - } - - let mut request = match request_builder.body(body.into()) { - Ok(req) => req, - Err(e) => return Err(OTelSdkError::InternalFailure(e.to_string())), - }; - - for (k, v) in self.headers.iter() { - request.headers_mut().insert(k.clone(), v.clone()); - } - - let request_uri = request.uri().to_string(); - otel_debug!(name: "HttpTracesClient.ExportStarted"); - let response = client - .send_bytes(request) - .await - .map_err(|e| OTelSdkError::InternalFailure(format!("{e:?}")))?; - - if !response.status().is_success() { - let error = format!( - "OpenTelemetry trace export failed. 
Url: {}, Status Code: {}, Response: {:?}", - request_uri, - response.status().as_u16(), - response.body() - ); - otel_debug!(name: "HttpTracesClient.ExportFailed", error = &error); - return Err(OTelSdkError::InternalFailure(error)); - } - - otel_debug!(name: "HttpTracesClient.ExportSucceeded"); - Ok(()) } fn shutdown(&mut self) -> OTelSdkResult { From f0fe377ecce1fcf1d9d87d1a1f94287a7f69e9c8 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Fri, 5 Sep 2025 11:11:20 +0200 Subject: [PATCH 20/27] chore: configurable retry policy with default --- opentelemetry-otlp/src/exporter/http/mod.rs | 118 ++++++++++++++++-- opentelemetry-otlp/src/exporter/tonic/logs.rs | 17 +-- .../src/exporter/tonic/metrics.rs | 17 +-- opentelemetry-otlp/src/exporter/tonic/mod.rs | 76 +++++++++-- .../src/exporter/tonic/trace.rs | 17 +-- opentelemetry-otlp/src/lib.rs | 3 + opentelemetry-sdk/src/retry.rs | 2 +- 7 files changed, 206 insertions(+), 44 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/http/mod.rs b/opentelemetry-otlp/src/exporter/http/mod.rs index d04c8badc7..296ebb6f3f 100644 --- a/opentelemetry-otlp/src/exporter/http/mod.rs +++ b/opentelemetry-otlp/src/exporter/http/mod.rs @@ -25,7 +25,7 @@ use std::time::Duration; #[cfg(feature = "http-retry")] use crate::retry_classification::http::classify_http_error; #[cfg(feature = "http-retry")] -use opentelemetry_sdk::retry::RetryErrorType; +use opentelemetry_sdk::retry::{RetryErrorType, RetryPolicy}; // Shared HTTP retry functionality /// HTTP-specific error wrapper for retry classification @@ -111,6 +111,10 @@ pub struct HttpConfig { /// The compression algorithm to use when communicating with the OTLP endpoint. compression: Option, + + /// The retry policy to use for HTTP requests. + #[cfg(feature = "http-retry")] + retry_policy: Option, } /// Configuration for the OTLP HTTP exporter. @@ -282,6 +286,8 @@ impl HttpExporterBuilder { self.exporter_config.protocol, timeout, compression, + #[cfg(feature = "http-retry")] + self.http_config.retry_policy.take(), )) } @@ -361,6 +367,8 @@ pub(crate) struct OtlpHttpClient { protocol: Protocol, _timeout: Duration, compression: Option, + #[cfg(feature = "http-retry")] + retry_policy: RetryPolicy, #[allow(dead_code)] // would be removed once we support set_resource for metrics and traces. 
resource: opentelemetry_proto::transform::common::tonic::ResourceAttributesWithSchema, @@ -379,16 +387,9 @@ impl OtlpHttpClient { { #[cfg(feature = "http-retry")] { - use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; + use opentelemetry_sdk::retry::retry_with_backoff; use opentelemetry_sdk::runtime::Tokio; - let policy = RetryPolicy { - max_retries: 3, - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, - }; - // Build request body once before retry loop let (body, content_type, content_encoding) = build_body_fn(self, data) .map_err(opentelemetry_sdk::error::OTelSdkError::InternalFailure)?; @@ -401,7 +402,7 @@ impl OtlpHttpClient { retry_with_backoff( Tokio, - policy, + self.retry_policy.clone(), classify_http_export_error, operation_name, || async { @@ -544,6 +545,7 @@ impl OtlpHttpClient { protocol: Protocol, timeout: Duration, compression: Option, + #[cfg(feature = "http-retry")] retry_policy: Option, ) -> Self { OtlpHttpClient { client: Mutex::new(Some(client)), @@ -552,6 +554,13 @@ impl OtlpHttpClient { protocol, _timeout: timeout, compression, + #[cfg(feature = "http-retry")] + retry_policy: retry_policy.unwrap_or(RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }), resource: ResourceAttributesWithSchema::default(), } } @@ -722,6 +731,10 @@ pub trait WithHttpConfig { /// Set the compression algorithm to use when communicating with the collector. fn with_compression(self, compression: crate::Compression) -> Self; + + /// Set the retry policy for HTTP requests. + #[cfg(feature = "http-retry")] + fn with_retry_policy(self, policy: RetryPolicy) -> Self; } impl WithHttpConfig for B { @@ -746,6 +759,12 @@ impl WithHttpConfig for B { self.http_client_config().compression = Some(compression); self } + + #[cfg(feature = "http-retry")] + fn with_retry_policy(mut self, policy: RetryPolicy) -> Self { + self.http_client_config().retry_policy = Some(policy); + self + } } #[cfg(test)] @@ -991,6 +1010,8 @@ mod tests { client: None, headers: Some(initial_headers), compression: None, + #[cfg(feature = "http-retry")] + retry_policy: None, }, exporter_config: crate::ExportConfig::default(), }; @@ -1057,6 +1078,8 @@ mod tests { crate::Protocol::HttpBinary, std::time::Duration::from_secs(10), Some(crate::Compression::Gzip), + #[cfg(feature = "http-retry")] + None, ); // Test with some sample data @@ -1088,6 +1111,8 @@ mod tests { crate::Protocol::HttpBinary, std::time::Duration::from_secs(10), Some(crate::Compression::Zstd), + #[cfg(feature = "http-retry")] + None, ); // Test with some sample data @@ -1116,6 +1141,8 @@ mod tests { crate::Protocol::HttpBinary, std::time::Duration::from_secs(10), None, // No compression + #[cfg(feature = "http-retry")] + None, ); let body = vec![1, 2, 3, 4]; @@ -1137,6 +1164,8 @@ mod tests { crate::Protocol::HttpBinary, std::time::Duration::from_secs(10), Some(crate::Compression::Gzip), + #[cfg(feature = "http-retry")] + None, ); let body = vec![1, 2, 3, 4]; @@ -1159,6 +1188,8 @@ mod tests { crate::Protocol::HttpBinary, std::time::Duration::from_secs(10), Some(crate::Compression::Zstd), + #[cfg(feature = "http-retry")] + None, ); let body = vec![1, 2, 3, 4]; @@ -1221,6 +1252,8 @@ mod tests { protocol, std::time::Duration::from_secs(10), compression, + #[cfg(feature = "http-retry")] + None, ) } @@ -1454,5 +1487,70 @@ mod tests { Err(ExporterBuildError::UnsupportedCompressionAlgorithm(_)) )); } + + #[cfg(feature = "http-retry")] + #[test] + fn test_with_retry_policy() { + use 
super::super::HttpExporterBuilder; + use crate::WithHttpConfig; + use opentelemetry_sdk::retry::RetryPolicy; + + let custom_policy = RetryPolicy { + max_retries: 5, + initial_delay_ms: 200, + max_delay_ms: 3200, + jitter_ms: 50, + }; + + let builder = HttpExporterBuilder::default().with_retry_policy(custom_policy); + + // Verify the retry policy was set + let retry_policy = builder.http_config.retry_policy.as_ref().unwrap(); + assert_eq!(retry_policy.max_retries, 5); + assert_eq!(retry_policy.initial_delay_ms, 200); + assert_eq!(retry_policy.max_delay_ms, 3200); + assert_eq!(retry_policy.jitter_ms, 50); + } + + #[cfg(feature = "http-retry")] + #[test] + fn test_default_retry_policy_when_none_configured() { + let client = create_test_client(crate::Protocol::HttpBinary, None); + + // Verify default values are used + assert_eq!(client.retry_policy.max_retries, 3); + assert_eq!(client.retry_policy.initial_delay_ms, 100); + assert_eq!(client.retry_policy.max_delay_ms, 1600); + assert_eq!(client.retry_policy.jitter_ms, 100); + } + + #[cfg(feature = "http-retry")] + #[test] + fn test_custom_retry_policy_used() { + use opentelemetry_sdk::retry::RetryPolicy; + + let custom_policy = RetryPolicy { + max_retries: 7, + initial_delay_ms: 500, + max_delay_ms: 5000, + jitter_ms: 200, + }; + + let client = OtlpHttpClient::new( + std::sync::Arc::new(MockHttpClient), + "http://localhost:4318".parse().unwrap(), + HashMap::new(), + crate::Protocol::HttpBinary, + std::time::Duration::from_secs(10), + None, + Some(custom_policy), + ); + + // Verify custom values are used + assert_eq!(client.retry_policy.max_retries, 7); + assert_eq!(client.retry_policy.initial_delay_ms, 500); + assert_eq!(client.retry_policy.max_delay_ms, 5000); + assert_eq!(client.retry_policy.jitter_ms, 200); + } } } diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index fee1244686..650962ba28 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -20,6 +20,7 @@ use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicLogsClient { inner: Mutex>, + retry_policy: RetryPolicy, #[allow(dead_code)] // would be removed once we support set_resource for metrics. 
resource: opentelemetry_proto::transform::common::tonic::ResourceAttributesWithSchema, @@ -41,6 +42,7 @@ impl TonicLogsClient { channel: Channel, interceptor: BoxInterceptor, compression: Option, + retry_policy: Option, ) -> Self { let mut client = LogsServiceClient::new(channel); if let Some(compression) = compression { @@ -56,6 +58,12 @@ impl TonicLogsClient { client, interceptor, })), + retry_policy: retry_policy.unwrap_or(RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }), resource: Default::default(), } } @@ -63,18 +71,11 @@ impl TonicLogsClient { impl LogExporter for TonicLogsClient { async fn export(&self, batch: LogBatch<'_>) -> OTelSdkResult { - let policy = RetryPolicy { - max_retries: 3, - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, - }; - let batch = Arc::new(batch); match retry_with_backoff( Tokio, - policy, + self.retry_policy.clone(), classify_tonic_status, "TonicLogsClient.Export", || async { diff --git a/opentelemetry-otlp/src/exporter/tonic/metrics.rs b/opentelemetry-otlp/src/exporter/tonic/metrics.rs index d6244b4bae..223df3fd2f 100644 --- a/opentelemetry-otlp/src/exporter/tonic/metrics.rs +++ b/opentelemetry-otlp/src/exporter/tonic/metrics.rs @@ -18,6 +18,7 @@ use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicMetricsClient { inner: Mutex>, + retry_policy: RetryPolicy, } struct ClientInner { @@ -36,6 +37,7 @@ impl TonicMetricsClient { channel: Channel, interceptor: BoxInterceptor, compression: Option, + retry_policy: Option, ) -> Self { let mut client = MetricsServiceClient::new(channel); if let Some(compression) = compression { @@ -51,22 +53,21 @@ impl TonicMetricsClient { client, interceptor, })), + retry_policy: retry_policy.unwrap_or(RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }), } } } impl MetricsClient for TonicMetricsClient { async fn export(&self, metrics: &ResourceMetrics) -> OTelSdkResult { - let policy = RetryPolicy { - max_retries: 3, - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, - }; - match retry_with_backoff( Tokio, - policy, + self.retry_policy.clone(), classify_tonic_status, "TonicMetricsClient.Export", || async { diff --git a/opentelemetry-otlp/src/exporter/tonic/mod.rs b/opentelemetry-otlp/src/exporter/tonic/mod.rs index 6986107602..ee8f11f2a1 100644 --- a/opentelemetry-otlp/src/exporter/tonic/mod.rs +++ b/opentelemetry-otlp/src/exporter/tonic/mod.rs @@ -16,6 +16,9 @@ use super::{resolve_timeout, ExporterBuildError}; use crate::exporter::Compression; use crate::{ExportConfig, OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_HEADERS}; +#[cfg(feature = "grpc-tonic")] +use opentelemetry_sdk::retry::RetryPolicy; + #[cfg(feature = "logs")] pub(crate) mod logs; @@ -40,6 +43,9 @@ pub struct TonicConfig { pub(crate) compression: Option, pub(crate) channel: Option, pub(crate) interceptor: Option, + /// The retry policy to use for gRPC requests. 
+ #[cfg(feature = "grpc-tonic")] + pub(crate) retry_policy: Option, } impl TryFrom for tonic::codec::CompressionEncoding { @@ -132,6 +138,8 @@ impl Default for TonicExporterBuilder { compression: None, channel: Option::default(), interceptor: Option::default(), + #[cfg(feature = "grpc-tonic")] + retry_policy: None, }, exporter_config: ExportConfig { protocol: crate::Protocol::Grpc, @@ -150,7 +158,7 @@ impl TonicExporterBuilder { signal_timeout_var: &str, signal_compression_var: &str, signal_headers_var: &str, - ) -> Result<(Channel, BoxInterceptor, Option), ExporterBuildError> { + ) -> Result<(Channel, BoxInterceptor, Option, Option), ExporterBuildError> { let compression = self.resolve_compression(signal_compression_var)?; let (headers_from_env, headers_for_logging) = parse_headers_from_env(signal_headers_var); @@ -181,9 +189,15 @@ impl TonicExporterBuilder { None => BoxInterceptor(Box::new(add_metadata)), }; + // Get retry policy before consuming self + #[cfg(feature = "grpc-tonic")] + let retry_policy = self.tonic_config.retry_policy.clone(); + // If a custom channel was provided, use that channel instead of creating one if let Some(channel) = self.tonic_config.channel { - return Ok((channel, interceptor, compression)); + return Ok((channel, interceptor, compression, + #[cfg(feature = "grpc-tonic")] retry_policy, + #[cfg(not(feature = "grpc-tonic"))] None)); } let config = self.exporter_config; @@ -211,7 +225,9 @@ impl TonicExporterBuilder { let channel = endpoint.timeout(timeout).connect_lazy(); otel_debug!(name: "TonicChannelBuilt", endpoint = endpoint_clone, timeout_in_millisecs = timeout.as_millis(), compression = format!("{:?}", compression), headers = format!("{:?}", headers_for_logging)); - Ok((channel, interceptor, compression)) + Ok((channel, interceptor, compression, + #[cfg(feature = "grpc-tonic")] retry_policy, + #[cfg(not(feature = "grpc-tonic"))] None)) } fn resolve_endpoint(default_endpoint_var: &str, provided_endpoint: Option) -> String { @@ -249,14 +265,14 @@ impl TonicExporterBuilder { otel_debug!(name: "LogsTonicChannelBuilding"); - let (channel, interceptor, compression) = self.build_channel( + let (channel, interceptor, compression, retry_policy) = self.build_channel( crate::logs::OTEL_EXPORTER_OTLP_LOGS_ENDPOINT, crate::logs::OTEL_EXPORTER_OTLP_LOGS_TIMEOUT, crate::logs::OTEL_EXPORTER_OTLP_LOGS_COMPRESSION, crate::logs::OTEL_EXPORTER_OTLP_LOGS_HEADERS, )?; - let client = TonicLogsClient::new(channel, interceptor, compression); + let client = TonicLogsClient::new(channel, interceptor, compression, retry_policy); Ok(crate::logs::LogExporter::from_tonic(client)) } @@ -272,14 +288,14 @@ impl TonicExporterBuilder { otel_debug!(name: "MetricsTonicChannelBuilding"); - let (channel, interceptor, compression) = self.build_channel( + let (channel, interceptor, compression, retry_policy) = self.build_channel( crate::metric::OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, crate::metric::OTEL_EXPORTER_OTLP_METRICS_TIMEOUT, crate::metric::OTEL_EXPORTER_OTLP_METRICS_COMPRESSION, crate::metric::OTEL_EXPORTER_OTLP_METRICS_HEADERS, )?; - let client = TonicMetricsClient::new(channel, interceptor, compression); + let client = TonicMetricsClient::new(channel, interceptor, compression, retry_policy); Ok(MetricExporter::from_tonic(client, temporality)) } @@ -291,14 +307,14 @@ impl TonicExporterBuilder { otel_debug!(name: "TracesTonicChannelBuilding"); - let (channel, interceptor, compression) = self.build_channel( + let (channel, interceptor, compression, retry_policy) = self.build_channel( 
crate::span::OTEL_EXPORTER_OTLP_TRACES_ENDPOINT, crate::span::OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, crate::span::OTEL_EXPORTER_OTLP_TRACES_COMPRESSION, crate::span::OTEL_EXPORTER_OTLP_TRACES_HEADERS, )?; - let client = TonicTracesClient::new(channel, interceptor, compression); + let client = TonicTracesClient::new(channel, interceptor, compression, retry_policy); Ok(crate::SpanExporter::from_tonic(client)) } @@ -475,6 +491,10 @@ pub trait WithTonicConfig { fn with_interceptor(self, interceptor: I) -> Self where I: tonic::service::Interceptor + Clone + Send + Sync + 'static; + + /// Set the retry policy for gRPC requests. + #[cfg(feature = "grpc-tonic")] + fn with_retry_policy(self, policy: RetryPolicy) -> Self; } impl WithTonicConfig for B { @@ -516,6 +536,12 @@ impl WithTonicConfig for B { self.tonic_config().interceptor = Some(BoxInterceptor(Box::new(interceptor))); self } + + #[cfg(feature = "grpc-tonic")] + fn with_retry_policy(mut self, policy: RetryPolicy) -> Self { + self.tonic_config().retry_policy = Some(policy); + self + } } #[cfg(test)] @@ -748,4 +774,36 @@ mod tests { assert_eq!(url, "http://localhost:4317"); }); } + + #[cfg(feature = "grpc-tonic")] + #[test] + fn test_with_retry_policy() { + use crate::WithTonicConfig; + use opentelemetry_sdk::retry::RetryPolicy; + + let custom_policy = RetryPolicy { + max_retries: 5, + initial_delay_ms: 200, + max_delay_ms: 3200, + jitter_ms: 50, + }; + + let builder = TonicExporterBuilder::default().with_retry_policy(custom_policy); + + // Verify the retry policy was set + let retry_policy = builder.tonic_config.retry_policy.as_ref().unwrap(); + assert_eq!(retry_policy.max_retries, 5); + assert_eq!(retry_policy.initial_delay_ms, 200); + assert_eq!(retry_policy.max_delay_ms, 3200); + assert_eq!(retry_policy.jitter_ms, 50); + } + + #[cfg(feature = "grpc-tonic")] + #[test] + fn test_default_retry_policy_when_none_configured() { + // This test requires us to create a tonic client, but we can't easily do that without + // a channel in a unit test. The default behavior is tested implicitly in integration tests. + let builder = TonicExporterBuilder::default(); + assert!(builder.tonic_config.retry_policy.is_none()); + } } diff --git a/opentelemetry-otlp/src/exporter/tonic/trace.rs b/opentelemetry-otlp/src/exporter/tonic/trace.rs index a5239b4602..5e32660998 100644 --- a/opentelemetry-otlp/src/exporter/tonic/trace.rs +++ b/opentelemetry-otlp/src/exporter/tonic/trace.rs @@ -22,6 +22,7 @@ use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicTracesClient { inner: Option, + retry_policy: RetryPolicy, #[allow(dead_code)] // would be removed once we support set_resource for metrics. 
resource: opentelemetry_proto::transform::common::tonic::ResourceAttributesWithSchema, @@ -43,6 +44,7 @@ impl TonicTracesClient { channel: Channel, interceptor: BoxInterceptor, compression: Option, + retry_policy: Option, ) -> Self { let mut client = TraceServiceClient::new(channel); if let Some(compression) = compression { @@ -58,6 +60,12 @@ impl TonicTracesClient { client, interceptor: Mutex::new(interceptor), }), + retry_policy: retry_policy.unwrap_or(RetryPolicy { + max_retries: 3, + initial_delay_ms: 100, + max_delay_ms: 1600, + jitter_ms: 100, + }), resource: Default::default(), } } @@ -65,18 +73,11 @@ impl TonicTracesClient { impl SpanExporter for TonicTracesClient { async fn export(&self, batch: Vec) -> OTelSdkResult { - let policy = RetryPolicy { - max_retries: 3, - initial_delay_ms: 100, - max_delay_ms: 1600, - jitter_ms: 100, - }; - let batch = Arc::new(batch); match retry_with_backoff( Tokio, - policy, + self.retry_policy.clone(), classify_tonic_status, "TonicTracesClient.Export", || async { diff --git a/opentelemetry-otlp/src/lib.rs b/opentelemetry-otlp/src/lib.rs index e8de66d273..ac32e38732 100644 --- a/opentelemetry-otlp/src/lib.rs +++ b/opentelemetry-otlp/src/lib.rs @@ -409,6 +409,9 @@ pub use crate::exporter::{ OTEL_EXPORTER_OTLP_TIMEOUT_DEFAULT, }; +#[cfg(feature = "http-retry")] +pub use opentelemetry_sdk::retry::RetryPolicy; + /// Type to indicate the builder does not have a client set. #[derive(Debug, Default, Clone)] pub struct NoExporterBuilderSet; diff --git a/opentelemetry-sdk/src/retry.rs b/opentelemetry-sdk/src/retry.rs index 48f4cf49a7..17d1b183b0 100644 --- a/opentelemetry-sdk/src/retry.rs +++ b/opentelemetry-sdk/src/retry.rs @@ -32,7 +32,7 @@ pub enum RetryErrorType { } /// Configuration for retry policy. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct RetryPolicy { /// Maximum number of retry attempts. 
pub max_retries: usize, From ff93dd518a93856f0568b560f109708baaf60b7e Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Fri, 5 Sep 2025 11:29:32 +0200 Subject: [PATCH 21/27] chore: modify tonic exporters so we don't _need_ the async runtime if we don't want retry --- opentelemetry-otlp/Cargo.toml | 8 +- opentelemetry-otlp/src/exporter/http/mod.rs | 62 +++++----- opentelemetry-otlp/src/exporter/tonic/logs.rs | 11 +- .../src/exporter/tonic/metrics.rs | 11 +- opentelemetry-otlp/src/exporter/tonic/mod.rs | 108 +++++++++++++++--- .../src/exporter/tonic/trace.rs | 11 +- opentelemetry-otlp/src/lib.rs | 4 +- 7 files changed, 152 insertions(+), 63 deletions(-) diff --git a/opentelemetry-otlp/Cargo.toml b/opentelemetry-otlp/Cargo.toml index 10939cac5f..95b8c27166 100644 --- a/opentelemetry-otlp/Cargo.toml +++ b/opentelemetry-otlp/Cargo.toml @@ -70,10 +70,13 @@ serialize = ["serde", "serde_json"] default = ["http-proto", "reqwest-blocking-client", "trace", "metrics", "logs", "internal-logs"] # grpc using tonic -grpc-tonic = ["tonic", "tonic-types", "prost", "http", "tokio", "opentelemetry-proto/gen-tonic", "opentelemetry_sdk/rt-tokio", "opentelemetry_sdk/experimental_async_runtime"] +grpc-tonic = ["tonic", "tonic-types", "prost", "http", "tokio", "opentelemetry-proto/gen-tonic"] gzip-tonic = ["tonic/gzip"] zstd-tonic = ["tonic/zstd"] +# grpc with retry support +experimental-grpc-retry = ["grpc-tonic", "opentelemetry_sdk/experimental_async_runtime", "opentelemetry_sdk/rt-tokio"] + # http compression gzip-http = ["flate2"] zstd-http = ["zstd"] @@ -85,8 +88,7 @@ tls-webpki-roots = ["tls", "tonic/tls-webpki-roots"] http-proto = ["prost", "opentelemetry-http", "opentelemetry-proto/gen-tonic-messages", "http", "trace", "metrics"] # http with retry support. -# What should we do with this? We need the async_runtime. gRPC exporters already need it. 
-http-retry = ["opentelemetry_sdk/experimental_async_runtime", "opentelemetry_sdk/rt-tokio", "tokio"] +experimental-http-retry = ["opentelemetry_sdk/experimental_async_runtime", "opentelemetry_sdk/rt-tokio", "tokio"] http-json = ["serde_json", "prost", "opentelemetry-http", "opentelemetry-proto/gen-tonic-messages", "opentelemetry-proto/with-serde", "http", "trace", "metrics"] reqwest-blocking-client = ["reqwest/blocking", "opentelemetry-http/reqwest-blocking"] diff --git a/opentelemetry-otlp/src/exporter/http/mod.rs b/opentelemetry-otlp/src/exporter/http/mod.rs index 296ebb6f3f..0f57a0292e 100644 --- a/opentelemetry-otlp/src/exporter/http/mod.rs +++ b/opentelemetry-otlp/src/exporter/http/mod.rs @@ -22,18 +22,18 @@ use std::str::FromStr; use std::sync::{Arc, Mutex}; use std::time::Duration; -#[cfg(feature = "http-retry")] +#[cfg(feature = "experimental-http-retry")] use crate::retry_classification::http::classify_http_error; -#[cfg(feature = "http-retry")] +#[cfg(feature = "experimental-http-retry")] use opentelemetry_sdk::retry::{RetryErrorType, RetryPolicy}; // Shared HTTP retry functionality /// HTTP-specific error wrapper for retry classification #[derive(Debug)] pub(crate) struct HttpExportError { - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] pub status_code: u16, - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] pub retry_after: Option, pub message: String, } @@ -42,9 +42,9 @@ impl HttpExportError { /// Create a new HttpExportError without retry-after header pub(crate) fn new(_status_code: u16, message: String) -> Self { Self { - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] status_code: _status_code, - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] retry_after: None, message, } @@ -57,16 +57,16 @@ impl HttpExportError { message: String, ) -> Self { Self { - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] status_code: _status_code, - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] retry_after: Some(_retry_after), message, } } } -#[cfg(feature = "http-retry")] +#[cfg(feature = "experimental-http-retry")] /// Classify HTTP export errors for retry decisions pub(crate) fn classify_http_export_error(error: &HttpExportError) -> RetryErrorType { classify_http_error(error.status_code, error.retry_after.as_deref()) @@ -113,7 +113,7 @@ pub struct HttpConfig { compression: Option, /// The retry policy to use for HTTP requests. - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] retry_policy: Option, } @@ -286,7 +286,7 @@ impl HttpExporterBuilder { self.exporter_config.protocol, timeout, compression, - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] self.http_config.retry_policy.take(), )) } @@ -367,7 +367,7 @@ pub(crate) struct OtlpHttpClient { protocol: Protocol, _timeout: Duration, compression: Option, - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] retry_policy: RetryPolicy, #[allow(dead_code)] // would be removed once we support set_resource for metrics and traces. 
@@ -385,7 +385,7 @@ impl OtlpHttpClient { where F: Fn(&Self, T) -> Result<(Vec, &'static str, Option<&'static str>), String>, { - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] { use opentelemetry_sdk::retry::retry_with_backoff; use opentelemetry_sdk::runtime::Tokio; @@ -419,7 +419,7 @@ impl OtlpHttpClient { .map_err(|e| opentelemetry_sdk::error::OTelSdkError::InternalFailure(e.message)) } - #[cfg(not(feature = "http-retry"))] + #[cfg(not(feature = "experimental-http-retry"))] { let (body, content_type, content_encoding) = build_body_fn(self, data) .map_err(opentelemetry_sdk::error::OTelSdkError::InternalFailure)?; @@ -545,7 +545,7 @@ impl OtlpHttpClient { protocol: Protocol, timeout: Duration, compression: Option, - #[cfg(feature = "http-retry")] retry_policy: Option, + #[cfg(feature = "experimental-http-retry")] retry_policy: Option, ) -> Self { OtlpHttpClient { client: Mutex::new(Some(client)), @@ -554,7 +554,7 @@ impl OtlpHttpClient { protocol, _timeout: timeout, compression, - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] retry_policy: retry_policy.unwrap_or(RetryPolicy { max_retries: 3, initial_delay_ms: 100, @@ -733,7 +733,7 @@ pub trait WithHttpConfig { fn with_compression(self, compression: crate::Compression) -> Self; /// Set the retry policy for HTTP requests. - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] fn with_retry_policy(self, policy: RetryPolicy) -> Self; } @@ -760,7 +760,7 @@ impl WithHttpConfig for B { self } - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] fn with_retry_policy(mut self, policy: RetryPolicy) -> Self { self.http_client_config().retry_policy = Some(policy); self @@ -1010,7 +1010,7 @@ mod tests { client: None, headers: Some(initial_headers), compression: None, - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] retry_policy: None, }, exporter_config: crate::ExportConfig::default(), @@ -1078,7 +1078,7 @@ mod tests { crate::Protocol::HttpBinary, std::time::Duration::from_secs(10), Some(crate::Compression::Gzip), - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] None, ); @@ -1111,7 +1111,7 @@ mod tests { crate::Protocol::HttpBinary, std::time::Duration::from_secs(10), Some(crate::Compression::Zstd), - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] None, ); @@ -1141,7 +1141,7 @@ mod tests { crate::Protocol::HttpBinary, std::time::Duration::from_secs(10), None, // No compression - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] None, ); @@ -1164,7 +1164,7 @@ mod tests { crate::Protocol::HttpBinary, std::time::Duration::from_secs(10), Some(crate::Compression::Gzip), - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] None, ); @@ -1188,7 +1188,7 @@ mod tests { crate::Protocol::HttpBinary, std::time::Duration::from_secs(10), Some(crate::Compression::Zstd), - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] None, ); @@ -1252,7 +1252,7 @@ mod tests { protocol, std::time::Duration::from_secs(10), compression, - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] None, ) } @@ -1488,7 +1488,7 @@ mod tests { )); } - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] #[test] fn test_with_retry_policy() { use super::super::HttpExporterBuilder; @@ -1512,11 +1512,11 @@ mod tests { assert_eq!(retry_policy.jitter_ms, 50); } - 
#[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] #[test] fn test_default_retry_policy_when_none_configured() { let client = create_test_client(crate::Protocol::HttpBinary, None); - + // Verify default values are used assert_eq!(client.retry_policy.max_retries, 3); assert_eq!(client.retry_policy.initial_delay_ms, 100); @@ -1524,11 +1524,11 @@ mod tests { assert_eq!(client.retry_policy.jitter_ms, 100); } - #[cfg(feature = "http-retry")] + #[cfg(feature = "experimental-http-retry")] #[test] fn test_custom_retry_policy_used() { use opentelemetry_sdk::retry::RetryPolicy; - + let custom_policy = RetryPolicy { max_retries: 7, initial_delay_ms: 500, @@ -1545,7 +1545,7 @@ mod tests { None, Some(custom_policy), ); - + // Verify custom values are used assert_eq!(client.retry_policy.max_retries, 7); assert_eq!(client.retry_policy.initial_delay_ms, 500); diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs b/opentelemetry-otlp/src/exporter/tonic/logs.rs index 650962ba28..071df38389 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -14,8 +14,8 @@ use opentelemetry_proto::transform::logs::tonic::group_logs_by_resource_and_scop use super::BoxInterceptor; -use crate::retry_classification::grpc::classify_tonic_status; -use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; +use opentelemetry_sdk::retry::RetryPolicy; +#[cfg(feature = "experimental-grpc-retry")] use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicLogsClient { @@ -73,10 +73,13 @@ impl LogExporter for TonicLogsClient { async fn export(&self, batch: LogBatch<'_>) -> OTelSdkResult { let batch = Arc::new(batch); - match retry_with_backoff( + match super::tonic_retry_with_backoff( + #[cfg(feature = "experimental-grpc-retry")] Tokio, + #[cfg(not(feature = "experimental-grpc-retry"))] + (), self.retry_policy.clone(), - classify_tonic_status, + crate::retry_classification::grpc::classify_tonic_status, "TonicLogsClient.Export", || async { let batch_clone = Arc::clone(&batch); diff --git a/opentelemetry-otlp/src/exporter/tonic/metrics.rs b/opentelemetry-otlp/src/exporter/tonic/metrics.rs index 223df3fd2f..aaa50bfb39 100644 --- a/opentelemetry-otlp/src/exporter/tonic/metrics.rs +++ b/opentelemetry-otlp/src/exporter/tonic/metrics.rs @@ -12,8 +12,8 @@ use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Chann use super::BoxInterceptor; use crate::metric::MetricsClient; -use crate::retry_classification::grpc::classify_tonic_status; -use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; +use opentelemetry_sdk::retry::RetryPolicy; +#[cfg(feature = "experimental-grpc-retry")] use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicMetricsClient { @@ -65,10 +65,13 @@ impl TonicMetricsClient { impl MetricsClient for TonicMetricsClient { async fn export(&self, metrics: &ResourceMetrics) -> OTelSdkResult { - match retry_with_backoff( + match super::tonic_retry_with_backoff( + #[cfg(feature = "experimental-grpc-retry")] Tokio, + #[cfg(not(feature = "experimental-grpc-retry"))] + (), self.retry_policy.clone(), - classify_tonic_status, + crate::retry_classification::grpc::classify_tonic_status, "TonicMetricsClient.Export", || async { // Execute the export operation diff --git a/opentelemetry-otlp/src/exporter/tonic/mod.rs b/opentelemetry-otlp/src/exporter/tonic/mod.rs index ee8f11f2a1..0ff72b5970 100644 --- a/opentelemetry-otlp/src/exporter/tonic/mod.rs +++ b/opentelemetry-otlp/src/exporter/tonic/mod.rs @@ -16,8 
+16,23 @@ use super::{resolve_timeout, ExporterBuildError}; use crate::exporter::Compression; use crate::{ExportConfig, OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_HEADERS}; +#[cfg(all( + feature = "experimental-grpc-retry", + any(feature = "trace", feature = "metrics", feature = "logs") +))] +use opentelemetry_sdk::retry::retry_with_backoff; #[cfg(feature = "grpc-tonic")] use opentelemetry_sdk::retry::RetryPolicy; +#[cfg(all( + feature = "experimental-grpc-retry", + any(feature = "trace", feature = "metrics", feature = "logs") +))] +use opentelemetry_sdk::runtime::Runtime; +#[cfg(all( + feature = "grpc-tonic", + any(feature = "trace", feature = "metrics", feature = "logs") +))] +use std::future::Future; #[cfg(feature = "logs")] pub(crate) mod logs; @@ -44,7 +59,7 @@ pub struct TonicConfig { pub(crate) channel: Option, pub(crate) interceptor: Option, /// The retry policy to use for gRPC requests. - #[cfg(feature = "grpc-tonic")] + #[cfg(feature = "experimental-grpc-retry")] pub(crate) retry_policy: Option, } @@ -138,7 +153,7 @@ impl Default for TonicExporterBuilder { compression: None, channel: Option::default(), interceptor: Option::default(), - #[cfg(feature = "grpc-tonic")] + #[cfg(feature = "experimental-grpc-retry")] retry_policy: None, }, exporter_config: ExportConfig { @@ -158,7 +173,15 @@ impl TonicExporterBuilder { signal_timeout_var: &str, signal_compression_var: &str, signal_headers_var: &str, - ) -> Result<(Channel, BoxInterceptor, Option, Option), ExporterBuildError> { + ) -> Result< + ( + Channel, + BoxInterceptor, + Option, + Option, + ), + ExporterBuildError, + > { let compression = self.resolve_compression(signal_compression_var)?; let (headers_from_env, headers_for_logging) = parse_headers_from_env(signal_headers_var); @@ -190,14 +213,20 @@ impl TonicExporterBuilder { }; // Get retry policy before consuming self - #[cfg(feature = "grpc-tonic")] + #[cfg(feature = "experimental-grpc-retry")] let retry_policy = self.tonic_config.retry_policy.clone(); - + // If a custom channel was provided, use that channel instead of creating one if let Some(channel) = self.tonic_config.channel { - return Ok((channel, interceptor, compression, - #[cfg(feature = "grpc-tonic")] retry_policy, - #[cfg(not(feature = "grpc-tonic"))] None)); + return Ok(( + channel, + interceptor, + compression, + #[cfg(feature = "experimental-grpc-retry")] + retry_policy, + #[cfg(not(feature = "experimental-grpc-retry"))] + None, + )); } let config = self.exporter_config; @@ -225,9 +254,15 @@ impl TonicExporterBuilder { let channel = endpoint.timeout(timeout).connect_lazy(); otel_debug!(name: "TonicChannelBuilt", endpoint = endpoint_clone, timeout_in_millisecs = timeout.as_millis(), compression = format!("{:?}", compression), headers = format!("{:?}", headers_for_logging)); - Ok((channel, interceptor, compression, - #[cfg(feature = "grpc-tonic")] retry_policy, - #[cfg(not(feature = "grpc-tonic"))] None)) + Ok(( + channel, + interceptor, + compression, + #[cfg(feature = "experimental-grpc-retry")] + retry_policy, + #[cfg(not(feature = "experimental-grpc-retry"))] + None, + )) } fn resolve_endpoint(default_endpoint_var: &str, provided_endpoint: Option) -> String { @@ -320,6 +355,49 @@ impl TonicExporterBuilder { } } +/// Wrapper for retry functionality in tonic exporters. +/// Provides a unified call path that either uses retry_with_backoff when experimental-grpc-retry +/// feature is enabled, or executes the operation once when it's not. 
+#[cfg(all( + feature = "grpc-tonic", + feature = "experimental-grpc-retry", + any(feature = "trace", feature = "metrics", feature = "logs") +))] +async fn tonic_retry_with_backoff( + runtime: R, + policy: RetryPolicy, + classify_fn: fn(&tonic::Status) -> opentelemetry_sdk::retry::RetryErrorType, + operation_name: &'static str, + operation: F, +) -> Result +where + R: Runtime, + F: Fn() -> Fut, + Fut: Future>, +{ + retry_with_backoff(runtime, policy, classify_fn, operation_name, operation).await +} + +/// Provides a unified call path when experimental-grpc-retry is not enabled - just executes the operation once. +#[cfg(all( + feature = "grpc-tonic", + not(feature = "experimental-grpc-retry"), + any(feature = "trace", feature = "metrics", feature = "logs") +))] +async fn tonic_retry_with_backoff( + _runtime: (), + _policy: RetryPolicy, + _classify_fn: fn(&tonic::Status) -> opentelemetry_sdk::retry::RetryErrorType, + _operation_name: &'static str, + operation: F, +) -> Result +where + F: Fn() -> Fut, + Fut: Future>, +{ + operation().await +} + fn merge_metadata_with_headers_from_env( metadata: MetadataMap, headers_from_env: HeaderMap, @@ -493,7 +571,7 @@ pub trait WithTonicConfig { I: tonic::service::Interceptor + Clone + Send + Sync + 'static; /// Set the retry policy for gRPC requests. - #[cfg(feature = "grpc-tonic")] + #[cfg(feature = "experimental-grpc-retry")] fn with_retry_policy(self, policy: RetryPolicy) -> Self; } @@ -537,7 +615,7 @@ impl WithTonicConfig for B { self } - #[cfg(feature = "grpc-tonic")] + #[cfg(feature = "experimental-grpc-retry")] fn with_retry_policy(mut self, policy: RetryPolicy) -> Self { self.tonic_config().retry_policy = Some(policy); self @@ -775,7 +853,7 @@ mod tests { }); } - #[cfg(feature = "grpc-tonic")] + #[cfg(feature = "experimental-grpc-retry")] #[test] fn test_with_retry_policy() { use crate::WithTonicConfig; @@ -798,7 +876,7 @@ mod tests { assert_eq!(retry_policy.jitter_ms, 50); } - #[cfg(feature = "grpc-tonic")] + #[cfg(feature = "experimental-grpc-retry")] #[test] fn test_default_retry_policy_when_none_configured() { // This test requires us to create a tonic client, but we can't easily do that without diff --git a/opentelemetry-otlp/src/exporter/tonic/trace.rs b/opentelemetry-otlp/src/exporter/tonic/trace.rs index 5e32660998..c426ff7fcc 100644 --- a/opentelemetry-otlp/src/exporter/tonic/trace.rs +++ b/opentelemetry-otlp/src/exporter/tonic/trace.rs @@ -16,8 +16,8 @@ use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Chann use super::BoxInterceptor; -use crate::retry_classification::grpc::classify_tonic_status; -use opentelemetry_sdk::retry::{retry_with_backoff, RetryPolicy}; +use opentelemetry_sdk::retry::RetryPolicy; +#[cfg(feature = "experimental-grpc-retry")] use opentelemetry_sdk::runtime::Tokio; pub(crate) struct TonicTracesClient { @@ -75,10 +75,13 @@ impl SpanExporter for TonicTracesClient { async fn export(&self, batch: Vec) -> OTelSdkResult { let batch = Arc::new(batch); - match retry_with_backoff( + match super::tonic_retry_with_backoff( + #[cfg(feature = "experimental-grpc-retry")] Tokio, + #[cfg(not(feature = "experimental-grpc-retry"))] + (), self.retry_policy.clone(), - classify_tonic_status, + crate::retry_classification::grpc::classify_tonic_status, "TonicTracesClient.Export", || async { let batch_clone = Arc::clone(&batch); diff --git a/opentelemetry-otlp/src/lib.rs b/opentelemetry-otlp/src/lib.rs index ac32e38732..1c178e79fd 100644 --- a/opentelemetry-otlp/src/lib.rs +++ b/opentelemetry-otlp/src/lib.rs @@ 
-366,7 +366,7 @@ mod metric; #[cfg(any(feature = "http-proto", feature = "http-json", feature = "grpc-tonic"))] mod span; -#[cfg(any(feature = "grpc-tonic", feature = "http-retry"))] +#[cfg(any(feature = "grpc-tonic", feature = "experimental-http-retry"))] pub mod retry_classification; pub use crate::exporter::Compression; @@ -409,7 +409,7 @@ pub use crate::exporter::{ OTEL_EXPORTER_OTLP_TIMEOUT_DEFAULT, }; -#[cfg(feature = "http-retry")] +#[cfg(feature = "experimental-http-retry")] pub use opentelemetry_sdk::retry::RetryPolicy; /// Type to indicate the builder does not have a client set. From 9bf0161ddd86f86ea00345112cb70bf7e0646586 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Fri, 5 Sep 2025 13:05:17 +0200 Subject: [PATCH 22/27] chore: fix missing dep --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 010ce42b48..5fbac9ddf4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ thiserror = { version = "2", default-features = false } tonic = { version = "0.14.1", default-features = false } tonic-prost-build = "0.14.1" tonic-prost = "0.14.1" +tonic-types = "0.14.1" tokio = { version = "1", default-features = false } tokio-stream = "0.1" # Using `tracing 0.1.40` because 0.1.39 (which is yanked) introduces the ability to set event names in macros, From d449457809b4db1c125978629731420b3f9a63d8 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Mon, 6 Oct 2025 09:21:24 +0200 Subject: [PATCH 23/27] chore: move retry from sdk down to otlp for now --- opentelemetry-otlp/src/exporter/http/mod.rs | 8 +++--- opentelemetry-otlp/src/exporter/tonic/logs.rs | 2 +- .../src/exporter/tonic/metrics.rs | 2 +- opentelemetry-otlp/src/exporter/tonic/mod.rs | 10 +++---- .../src/exporter/tonic/trace.rs | 2 +- opentelemetry-otlp/src/lib.rs | 6 +++- .../src/retry.rs | 28 +++++++++++-------- .../src/retry_classification.rs | 4 +-- opentelemetry-sdk/src/lib.rs | 2 -- 9 files changed, 35 insertions(+), 29 deletions(-) rename {opentelemetry-sdk => opentelemetry-otlp}/src/retry.rs (94%) diff --git a/opentelemetry-otlp/src/exporter/http/mod.rs b/opentelemetry-otlp/src/exporter/http/mod.rs index 0f57a0292e..65e702c40f 100644 --- a/opentelemetry-otlp/src/exporter/http/mod.rs +++ b/opentelemetry-otlp/src/exporter/http/mod.rs @@ -25,7 +25,7 @@ use std::time::Duration; #[cfg(feature = "experimental-http-retry")] use crate::retry_classification::http::classify_http_error; #[cfg(feature = "experimental-http-retry")] -use opentelemetry_sdk::retry::{RetryErrorType, RetryPolicy}; +use crate::retry::{RetryErrorType, RetryPolicy}; // Shared HTTP retry functionality /// HTTP-specific error wrapper for retry classification @@ -387,7 +387,7 @@ impl OtlpHttpClient { { #[cfg(feature = "experimental-http-retry")] { - use opentelemetry_sdk::retry::retry_with_backoff; + use crate::retry::retry_with_backoff; use opentelemetry_sdk::runtime::Tokio; // Build request body once before retry loop @@ -1493,7 +1493,7 @@ mod tests { fn test_with_retry_policy() { use super::super::HttpExporterBuilder; use crate::WithHttpConfig; - use opentelemetry_sdk::retry::RetryPolicy; + use crate::retry::RetryPolicy; let custom_policy = RetryPolicy { max_retries: 5, @@ -1527,7 +1527,7 @@ mod tests { #[cfg(feature = "experimental-http-retry")] #[test] fn test_custom_retry_policy_used() { - use opentelemetry_sdk::retry::RetryPolicy; + use crate::retry::RetryPolicy; let custom_policy = RetryPolicy { max_retries: 7, diff --git a/opentelemetry-otlp/src/exporter/tonic/logs.rs 
b/opentelemetry-otlp/src/exporter/tonic/logs.rs index 071df38389..679e211a3d 100644 --- a/opentelemetry-otlp/src/exporter/tonic/logs.rs +++ b/opentelemetry-otlp/src/exporter/tonic/logs.rs @@ -14,7 +14,7 @@ use opentelemetry_proto::transform::logs::tonic::group_logs_by_resource_and_scop use super::BoxInterceptor; -use opentelemetry_sdk::retry::RetryPolicy; +use crate::retry::RetryPolicy; #[cfg(feature = "experimental-grpc-retry")] use opentelemetry_sdk::runtime::Tokio; diff --git a/opentelemetry-otlp/src/exporter/tonic/metrics.rs b/opentelemetry-otlp/src/exporter/tonic/metrics.rs index aaa50bfb39..58470f9d30 100644 --- a/opentelemetry-otlp/src/exporter/tonic/metrics.rs +++ b/opentelemetry-otlp/src/exporter/tonic/metrics.rs @@ -12,7 +12,7 @@ use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Chann use super::BoxInterceptor; use crate::metric::MetricsClient; -use opentelemetry_sdk::retry::RetryPolicy; +use crate::retry::RetryPolicy; #[cfg(feature = "experimental-grpc-retry")] use opentelemetry_sdk::runtime::Tokio; diff --git a/opentelemetry-otlp/src/exporter/tonic/mod.rs b/opentelemetry-otlp/src/exporter/tonic/mod.rs index 0ff72b5970..d99e298c06 100644 --- a/opentelemetry-otlp/src/exporter/tonic/mod.rs +++ b/opentelemetry-otlp/src/exporter/tonic/mod.rs @@ -20,9 +20,9 @@ use crate::{ExportConfig, OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_HEADER feature = "experimental-grpc-retry", any(feature = "trace", feature = "metrics", feature = "logs") ))] -use opentelemetry_sdk::retry::retry_with_backoff; +use crate::retry::retry_with_backoff; #[cfg(feature = "grpc-tonic")] -use opentelemetry_sdk::retry::RetryPolicy; +use crate::retry::RetryPolicy; #[cfg(all( feature = "experimental-grpc-retry", any(feature = "trace", feature = "metrics", feature = "logs") @@ -366,7 +366,7 @@ impl TonicExporterBuilder { async fn tonic_retry_with_backoff( runtime: R, policy: RetryPolicy, - classify_fn: fn(&tonic::Status) -> opentelemetry_sdk::retry::RetryErrorType, + classify_fn: fn(&tonic::Status) -> crate::retry::RetryErrorType, operation_name: &'static str, operation: F, ) -> Result @@ -387,7 +387,7 @@ where async fn tonic_retry_with_backoff( _runtime: (), _policy: RetryPolicy, - _classify_fn: fn(&tonic::Status) -> opentelemetry_sdk::retry::RetryErrorType, + _classify_fn: fn(&tonic::Status) -> crate::retry::RetryErrorType, _operation_name: &'static str, operation: F, ) -> Result @@ -857,7 +857,7 @@ mod tests { #[test] fn test_with_retry_policy() { use crate::WithTonicConfig; - use opentelemetry_sdk::retry::RetryPolicy; + use crate::retry::RetryPolicy; let custom_policy = RetryPolicy { max_retries: 5, diff --git a/opentelemetry-otlp/src/exporter/tonic/trace.rs b/opentelemetry-otlp/src/exporter/tonic/trace.rs index c426ff7fcc..cfead52e58 100644 --- a/opentelemetry-otlp/src/exporter/tonic/trace.rs +++ b/opentelemetry-otlp/src/exporter/tonic/trace.rs @@ -16,7 +16,7 @@ use tonic::{codegen::CompressionEncoding, service::Interceptor, transport::Chann use super::BoxInterceptor; -use opentelemetry_sdk::retry::RetryPolicy; +use crate::retry::RetryPolicy; #[cfg(feature = "experimental-grpc-retry")] use opentelemetry_sdk::runtime::Tokio; diff --git a/opentelemetry-otlp/src/lib.rs b/opentelemetry-otlp/src/lib.rs index 1c178e79fd..7bcc535939 100644 --- a/opentelemetry-otlp/src/lib.rs +++ b/opentelemetry-otlp/src/lib.rs @@ -369,6 +369,10 @@ mod span; #[cfg(any(feature = "grpc-tonic", feature = "experimental-http-retry"))] pub mod retry_classification; +/// Retry logic for exporting telemetry data. 
+#[cfg(any(feature = "grpc-tonic", feature = "experimental-http-retry"))] +pub mod retry; + pub use crate::exporter::Compression; pub use crate::exporter::ExportConfig; pub use crate::exporter::ExporterBuildError; @@ -410,7 +414,7 @@ pub use crate::exporter::{ }; #[cfg(feature = "experimental-http-retry")] -pub use opentelemetry_sdk::retry::RetryPolicy; +pub use retry::RetryPolicy; /// Type to indicate the builder does not have a client set. #[derive(Debug, Default, Clone)] diff --git a/opentelemetry-sdk/src/retry.rs b/opentelemetry-otlp/src/retry.rs similarity index 94% rename from opentelemetry-sdk/src/retry.rs rename to opentelemetry-otlp/src/retry.rs index 17d1b183b0..23f8327c4d 100644 --- a/opentelemetry-sdk/src/retry.rs +++ b/opentelemetry-otlp/src/retry.rs @@ -8,16 +8,17 @@ //! retries. The function uses error classification to determine retry behavior and can honor //! server-provided throttling hints. -#[cfg(feature = "experimental_async_runtime")] +#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] use opentelemetry::otel_warn; -#[cfg(feature = "experimental_async_runtime")] +#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] use std::future::Future; +use std::hash::DefaultHasher; use std::time::Duration; -#[cfg(feature = "experimental_async_runtime")] +#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] use std::time::SystemTime; -#[cfg(feature = "experimental_async_runtime")] -use crate::runtime::Runtime; +#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] +use opentelemetry_sdk::runtime::Runtime; /// Classification of errors for retry purposes. #[derive(Debug, Clone, PartialEq)] @@ -46,11 +47,11 @@ pub struct RetryPolicy { /// A runtime stub for when experimental_async_runtime is not enabled. /// This allows retry policy to be configured but no actual retries occur. -#[cfg(not(feature = "experimental_async_runtime"))] +#[cfg(not(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry")))] #[derive(Debug, Clone, Default)] pub struct NoOpRuntime; -#[cfg(not(feature = "experimental_async_runtime"))] +#[cfg(not(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry")))] impl NoOpRuntime { /// Creates a new no-op runtime. pub fn new() -> Self { @@ -59,13 +60,16 @@ impl NoOpRuntime { } // Generates a random jitter value up to max_jitter -#[cfg(feature = "experimental_async_runtime")] +#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] fn generate_jitter(max_jitter: u64) -> u64 { let now = SystemTime::now(); let nanos = now .duration_since(SystemTime::UNIX_EPOCH) .unwrap() .subsec_nanos(); + + let hasher = DefaultHasher::default(); + nanos as u64 % (max_jitter + 1) } @@ -86,7 +90,7 @@ fn generate_jitter(max_jitter: u64) -> u64 { /// /// A `Result` containing the operation's result or an error if max retries are reached /// or a non-retryable error occurs. -#[cfg(feature = "experimental_async_runtime")] +#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] pub async fn retry_with_backoff( runtime: R, policy: RetryPolicy, @@ -147,7 +151,7 @@ where /// No-op retry function for when experimental_async_runtime is not enabled. /// This function will execute the operation exactly once without any retries. 
-#[cfg(not(feature = "experimental_async_runtime"))] +#[cfg(not(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry")))] pub async fn retry_with_backoff( _runtime: R, _policy: RetryPolicy, @@ -163,10 +167,10 @@ where operation().await } -#[cfg(all(test, feature = "experimental_async_runtime", feature = "rt-tokio"))] +#[cfg(all(test, feature = "experimental-grpc-retry"))] mod tests { use super::*; - use crate::runtime::Tokio; + use opentelemetry_sdk::runtime::Tokio; use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Duration; use tokio::time::timeout; diff --git a/opentelemetry-otlp/src/retry_classification.rs b/opentelemetry-otlp/src/retry_classification.rs index 06a8d914ad..ca0f47100a 100644 --- a/opentelemetry-otlp/src/retry_classification.rs +++ b/opentelemetry-otlp/src/retry_classification.rs @@ -4,7 +4,7 @@ //! supporting server-provided throttling hints like HTTP Retry-After headers and //! gRPC RetryInfo metadata. -use opentelemetry_sdk::retry::RetryErrorType; +use crate::retry::RetryErrorType; use std::time::Duration; #[cfg(feature = "grpc-tonic")] @@ -222,7 +222,7 @@ mod tests { #[cfg(feature = "grpc-tonic")] mod grpc_tests { use crate::retry_classification::grpc::classify_tonic_status; - use opentelemetry_sdk::retry::RetryErrorType; + use crate::retry::RetryErrorType; use tonic_types::{ErrorDetails, StatusExt}; #[test] diff --git a/opentelemetry-sdk/src/lib.rs b/opentelemetry-sdk/src/lib.rs index 1143e333f8..893b5ab97d 100644 --- a/opentelemetry-sdk/src/lib.rs +++ b/opentelemetry-sdk/src/lib.rs @@ -168,5 +168,3 @@ impl From> for InMemoryExporterError { } } -/// Retry logic for exporting telemetry data. -pub mod retry; From 661bbe3c406db532580f5300f3aa9b052fbbb00b Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Mon, 6 Oct 2025 09:42:32 +0200 Subject: [PATCH 24/27] chore: use DefaultHasher to spread out nano time jitter --- opentelemetry-otlp/src/exporter/http/mod.rs | 6 +- opentelemetry-otlp/src/exporter/tonic/mod.rs | 2 +- opentelemetry-otlp/src/retry.rs | 60 ++++++++++++++----- .../src/retry_classification.rs | 2 +- opentelemetry-sdk/src/lib.rs | 1 - 5 files changed, 50 insertions(+), 21 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/http/mod.rs b/opentelemetry-otlp/src/exporter/http/mod.rs index 65e702c40f..b6336e37ae 100644 --- a/opentelemetry-otlp/src/exporter/http/mod.rs +++ b/opentelemetry-otlp/src/exporter/http/mod.rs @@ -22,10 +22,10 @@ use std::str::FromStr; use std::sync::{Arc, Mutex}; use std::time::Duration; -#[cfg(feature = "experimental-http-retry")] -use crate::retry_classification::http::classify_http_error; #[cfg(feature = "experimental-http-retry")] use crate::retry::{RetryErrorType, RetryPolicy}; +#[cfg(feature = "experimental-http-retry")] +use crate::retry_classification::http::classify_http_error; // Shared HTTP retry functionality /// HTTP-specific error wrapper for retry classification @@ -1492,8 +1492,8 @@ mod tests { #[test] fn test_with_retry_policy() { use super::super::HttpExporterBuilder; - use crate::WithHttpConfig; use crate::retry::RetryPolicy; + use crate::WithHttpConfig; let custom_policy = RetryPolicy { max_retries: 5, diff --git a/opentelemetry-otlp/src/exporter/tonic/mod.rs b/opentelemetry-otlp/src/exporter/tonic/mod.rs index d99e298c06..15c098f60b 100644 --- a/opentelemetry-otlp/src/exporter/tonic/mod.rs +++ b/opentelemetry-otlp/src/exporter/tonic/mod.rs @@ -856,8 +856,8 @@ mod tests { #[cfg(feature = "experimental-grpc-retry")] #[test] fn test_with_retry_policy() { - use 
crate::WithTonicConfig; use crate::retry::RetryPolicy; + use crate::WithTonicConfig; let custom_policy = RetryPolicy { max_retries: 5, diff --git a/opentelemetry-otlp/src/retry.rs b/opentelemetry-otlp/src/retry.rs index 23f8327c4d..939e44503b 100644 --- a/opentelemetry-otlp/src/retry.rs +++ b/opentelemetry-otlp/src/retry.rs @@ -8,16 +8,32 @@ //! retries. The function uses error classification to determine retry behavior and can honor //! server-provided throttling hints. -#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] +#[cfg(any( + feature = "experimental-grpc-retry", + feature = "experimental-http-retry" +))] use opentelemetry::otel_warn; -#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] +#[cfg(any( + feature = "experimental-grpc-retry", + feature = "experimental-http-retry" +))] use std::future::Future; -use std::hash::DefaultHasher; +#[cfg(any( + feature = "experimental-grpc-retry", + feature = "experimental-http-retry" +))] +use std::hash::{DefaultHasher, Hasher}; use std::time::Duration; -#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] +#[cfg(any( + feature = "experimental-grpc-retry", + feature = "experimental-http-retry" +))] use std::time::SystemTime; -#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] +#[cfg(any( + feature = "experimental-grpc-retry", + feature = "experimental-http-retry" +))] use opentelemetry_sdk::runtime::Runtime; /// Classification of errors for retry purposes. @@ -47,11 +63,17 @@ pub struct RetryPolicy { /// A runtime stub for when experimental_async_runtime is not enabled. /// This allows retry policy to be configured but no actual retries occur. -#[cfg(not(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry")))] +#[cfg(not(any( + feature = "experimental-grpc-retry", + feature = "experimental-http-retry" +)))] #[derive(Debug, Clone, Default)] pub struct NoOpRuntime; -#[cfg(not(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry")))] +#[cfg(not(any( + feature = "experimental-grpc-retry", + feature = "experimental-http-retry" +)))] impl NoOpRuntime { /// Creates a new no-op runtime. pub fn new() -> Self { @@ -60,17 +82,19 @@ impl NoOpRuntime { } // Generates a random jitter value up to max_jitter -#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] +#[cfg(any( + feature = "experimental-grpc-retry", + feature = "experimental-http-retry" +))] fn generate_jitter(max_jitter: u64) -> u64 { - let now = SystemTime::now(); - let nanos = now + let nanos = SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap() .subsec_nanos(); - let hasher = DefaultHasher::default(); - - nanos as u64 % (max_jitter + 1) + let mut hasher = DefaultHasher::default(); + hasher.write_u32(nanos); + hasher.finish() % (max_jitter + 1) } /// Retries the given operation with exponential backoff, jitter, and error classification. @@ -90,7 +114,10 @@ fn generate_jitter(max_jitter: u64) -> u64 { /// /// A `Result` containing the operation's result or an error if max retries are reached /// or a non-retryable error occurs. 
-#[cfg(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry"))] +#[cfg(any( + feature = "experimental-grpc-retry", + feature = "experimental-http-retry" +))] pub async fn retry_with_backoff( runtime: R, policy: RetryPolicy, @@ -151,7 +178,10 @@ where /// No-op retry function for when experimental_async_runtime is not enabled. /// This function will execute the operation exactly once without any retries. -#[cfg(not(any(feature = "experimental-grpc-retry", feature = "experimental-http-retry")))] +#[cfg(not(any( + feature = "experimental-grpc-retry", + feature = "experimental-http-retry" +)))] pub async fn retry_with_backoff( _runtime: R, _policy: RetryPolicy, diff --git a/opentelemetry-otlp/src/retry_classification.rs b/opentelemetry-otlp/src/retry_classification.rs index ca0f47100a..a69757d5de 100644 --- a/opentelemetry-otlp/src/retry_classification.rs +++ b/opentelemetry-otlp/src/retry_classification.rs @@ -221,8 +221,8 @@ mod tests { // Tests for gRPC error classification using public interface #[cfg(feature = "grpc-tonic")] mod grpc_tests { - use crate::retry_classification::grpc::classify_tonic_status; use crate::retry::RetryErrorType; + use crate::retry_classification::grpc::classify_tonic_status; use tonic_types::{ErrorDetails, StatusExt}; #[test] diff --git a/opentelemetry-sdk/src/lib.rs b/opentelemetry-sdk/src/lib.rs index 893b5ab97d..8f8e5a8653 100644 --- a/opentelemetry-sdk/src/lib.rs +++ b/opentelemetry-sdk/src/lib.rs @@ -167,4 +167,3 @@ impl From> for InMemoryExporterError { InMemoryExporterError::InternalFailure(format!("Mutex poison error: {err}")) } } - From 4e29dfaf898bb49c19ee0a204bc35669312a1fb9 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Mon, 6 Oct 2025 09:49:41 +0200 Subject: [PATCH 25/27] chore: impl date parsing --- opentelemetry-otlp/Cargo.toml | 3 +- .../src/retry_classification.rs | 57 ++++++++++++++++--- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/opentelemetry-otlp/Cargo.toml b/opentelemetry-otlp/Cargo.toml index 95b8c27166..5a030ecaff 100644 --- a/opentelemetry-otlp/Cargo.toml +++ b/opentelemetry-otlp/Cargo.toml @@ -40,6 +40,7 @@ tokio = { workspace = true, features = ["sync", "rt"], optional = true } reqwest = { workspace = true, optional = true } http = { workspace = true, optional = true } +httpdate = { version = "1.0.3", optional = true } serde = { workspace = true, features = ["derive"], optional = true } thiserror = { workspace = true } serde_json = { workspace = true, optional = true } @@ -88,7 +89,7 @@ tls-webpki-roots = ["tls", "tonic/tls-webpki-roots"] http-proto = ["prost", "opentelemetry-http", "opentelemetry-proto/gen-tonic-messages", "http", "trace", "metrics"] # http with retry support. -experimental-http-retry = ["opentelemetry_sdk/experimental_async_runtime", "opentelemetry_sdk/rt-tokio", "tokio"] +experimental-http-retry = ["opentelemetry_sdk/experimental_async_runtime", "opentelemetry_sdk/rt-tokio", "tokio", "httpdate"] http-json = ["serde_json", "prost", "opentelemetry-http", "opentelemetry-proto/gen-tonic-messages", "opentelemetry-proto/with-serde", "http", "trace", "metrics"] reqwest-blocking-client = ["reqwest/blocking", "opentelemetry-http/reqwest-blocking"] diff --git a/opentelemetry-otlp/src/retry_classification.rs b/opentelemetry-otlp/src/retry_classification.rs index a69757d5de..c65a7685d4 100644 --- a/opentelemetry-otlp/src/retry_classification.rs +++ b/opentelemetry-otlp/src/retry_classification.rs @@ -5,7 +5,6 @@ //! gRPC RetryInfo metadata. 
use crate::retry::RetryErrorType; -use std::time::Duration; #[cfg(feature = "grpc-tonic")] use tonic; @@ -14,8 +13,10 @@ use tonic; use tonic_types::StatusExt; /// HTTP-specific error classification with Retry-After header support. +#[cfg(feature = "experimental-http-retry")] pub mod http { use super::*; + use std::time::Duration; /// Classifies HTTP errors based on status code and headers. /// @@ -76,14 +77,17 @@ pub mod http { } /// Parses HTTP date format and returns delay in seconds from now. - /// - /// This is a simplified parser for the most common HTTP date format. - /// TODO - should we use a library here? fn parse_http_date_to_delay(date_str: &str) -> Result { - // For now, return error - would need proper HTTP date parsing - // This could be implemented with chrono or similar - let _ = date_str; - Err(()) + use std::time::SystemTime; + + // Try parse the date; if we fail, propagate an () error up to the caller. + let target_time = httpdate::parse_http_date(date_str).map_err(|_| ())?; + + let now = SystemTime::now(); + let delay = target_time + .duration_since(now) + .unwrap_or(std::time::Duration::ZERO); + Ok(delay.as_secs()) } } @@ -160,6 +164,7 @@ pub mod grpc { #[cfg(test)] mod tests { use super::*; + use std::time::Duration; // Tests for HTTP error classification mod http_tests { @@ -216,6 +221,42 @@ mod tests { assert_eq!(classify_http_error(200, None), RetryErrorType::Retryable); assert_eq!(classify_http_error(300, None), RetryErrorType::Retryable); } + + #[test] + #[cfg(feature = "experimental-http-retry")] + fn test_http_429_with_retry_after_valid_date() { + use std::time::SystemTime; + + // Create a time 30 seconds in the future + let future_time = SystemTime::now() + Duration::from_secs(30); + let date_str = httpdate::fmt_http_date(future_time); + let result = classify_http_error(429, Some(&date_str)); + match result { + RetryErrorType::Throttled(duration) => { + let secs = duration.as_secs(); + assert!( + (29..=30).contains(&secs), + "Expected ~30 seconds, got {}", + secs + ); + } + _ => panic!("Expected Throttled, got {:?}", result), + } + } + + #[test] + #[cfg(feature = "experimental-http-retry")] + fn test_http_429_with_retry_after_invalid_date() { + let result = classify_http_error(429, Some("Not a valid date")); + assert_eq!(result, RetryErrorType::Retryable); // Falls back to retryable + } + + #[test] + #[cfg(feature = "experimental-http-retry")] + fn test_http_429_with_retry_after_malformed_date() { + let result = classify_http_error(429, Some("Sun, 99 Nov 9999 99:99:99 GMT")); + assert_eq!(result, RetryErrorType::Retryable); // Falls back to retryable + } } // Tests for gRPC error classification using public interface From e49c4a70bcd98e0db7515f5077dfeda9640f53a7 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Mon, 6 Oct 2025 09:49:41 +0200 Subject: [PATCH 26/27] feat: add a 'no-async' runtime --- opentelemetry-sdk/src/runtime.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/opentelemetry-sdk/src/runtime.rs b/opentelemetry-sdk/src/runtime.rs index f82ac5943a..8ae9c61df7 100644 --- a/opentelemetry-sdk/src/runtime.rs +++ b/opentelemetry-sdk/src/runtime.rs @@ -245,3 +245,31 @@ impl RuntimeChannel for TokioCurrentThread { ) } } + +/// Runtime implementation for synchronous execution environments. +/// +/// This runtime can be used when executing in a non-async environment. +/// The runtime methods will perform their operations synchronously. 
+#[cfg(feature = "experimental_async_runtime")] +#[derive(Debug, Clone, Copy)] +pub struct NoAsync; + +#[cfg(feature = "experimental_async_runtime")] +impl Runtime for NoAsync { + fn spawn(&self, future: F) + where + F: Future + Send + 'static, + { + std::thread::spawn(move || { + futures_executor::block_on(future); + }); + } + + // Needed because async fn would borrow `self`, violating the `'static` requirement. + #[allow(clippy::manual_async_fn)] + fn delay(&self, duration: Duration) -> impl Future + Send + 'static { + async move { + std::thread::sleep(duration); + } + } +} From 9ba3a06df952d4888009764bebe665281f9e45b7 Mon Sep 17 00:00:00 2001 From: Scott Gerring Date: Wed, 8 Oct 2025 11:11:35 +0100 Subject: [PATCH 27/27] chore: select correct runtime --- opentelemetry-otlp/src/exporter/http/mod.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/opentelemetry-otlp/src/exporter/http/mod.rs b/opentelemetry-otlp/src/exporter/http/mod.rs index b6336e37ae..d4e8938983 100644 --- a/opentelemetry-otlp/src/exporter/http/mod.rs +++ b/opentelemetry-otlp/src/exporter/http/mod.rs @@ -388,7 +388,6 @@ impl OtlpHttpClient { #[cfg(feature = "experimental-http-retry")] { use crate::retry::retry_with_backoff; - use opentelemetry_sdk::runtime::Tokio; // Build request body once before retry loop let (body, content_type, content_encoding) = build_body_fn(self, data) @@ -400,8 +399,17 @@ impl OtlpHttpClient { endpoint: self.collector_endpoint.to_string(), }); + // Select runtime based on HTTP client feature - if we're using + // one without Tokio, we don't need or want the Tokio async blocking + // behaviour. + #[cfg(feature = "reqwest-blocking-client")] + let runtime = opentelemetry_sdk::runtime::NoAsync; + + #[cfg(not(feature = "reqwest-blocking-client"))] + let runtime = opentelemetry_sdk::runtime::Tokio; + retry_with_backoff( - Tokio, + runtime, self.retry_policy.clone(), classify_http_export_error, operation_name,