From 1ad10a95ec1111a8ab973ba9d1a3c1f25cb98e65 Mon Sep 17 00:00:00 2001 From: Stu Hood Date: Fri, 24 Jun 2022 11:58:29 -0700 Subject: [PATCH] Assorted remote metrics fixes. (#15914) * Fix the `RemoteStoreReadBlobTimeMicros` metric, which was not including the actual fetch time. * Add the relevant unit suffix to time-based metrics which don't have one. * Move `RemoteStoreBlobBytes{Up,Down}loaded` to observation metrics, in order to record histograms. * Additionally, report a `sum` value for histograms. * Simplify the "time to first byte" calculation by using an `Option` for the start time. [ci skip-build-wheels] --- src/python/pants/goal/stats_aggregator.py | 1 + src/rust/engine/fs/store/src/remote.rs | 25 ++++++++----------- .../engine/process_execution/src/remote.rs | 7 +++--- src/rust/engine/workunit_store/src/metrics.rs | 12 ++++----- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/python/pants/goal/stats_aggregator.py b/src/python/pants/goal/stats_aggregator.py index 2574bfd8346..481dc0852b5 100644 --- a/src/python/pants/goal/stats_aggregator.py +++ b/src/python/pants/goal/stats_aggregator.py @@ -146,6 +146,7 @@ def __call__( f" mean: {histogram.get_mean_value():.3f}\n" f" std dev: {histogram.get_stddev():.3f}\n" f" total observations: {histogram.total_count}\n" + f" sum: {int(histogram.get_mean_value() * histogram.total_count)}\n" f"{percentile_to_vals}" ) diff --git a/src/rust/engine/fs/store/src/remote.rs b/src/rust/engine/fs/store/src/remote.rs index e0ffbffe73c..e4db49cc353 100644 --- a/src/rust/engine/fs/store/src/remote.rs +++ b/src/rust/engine/fs/store/src/remote.rs @@ -22,7 +22,7 @@ use remexec::{ ServerCapabilities, }; use tonic::{Code, Request, Status}; -use workunit_store::{in_workunit, Metric, ObservationMetric}; +use workunit_store::{in_workunit, ObservationMetric}; use crate::StoreError; @@ -324,7 +324,7 @@ impl ByteStore { |workunit| async move { let result = result_future.await; if result.is_ok() { - workunit.increment_counter(Metric::RemoteStoreBlobBytesUploaded, len as u64); + workunit.record_observation(ObservationMetric::RemoteStoreBlobBytesUploaded, len as u64); } result }, @@ -356,7 +356,7 @@ impl ByteStore { let mut client = self.byte_stream_client.as_ref().clone(); let result_future = async move { - let start_time = Instant::now(); + let mut start_opt = Some(Instant::now()); let stream_result = client .read({ @@ -380,22 +380,17 @@ impl ByteStore { }; let read_result_closure = async { - let mut got_first_response = false; let mut buf = BytesMut::with_capacity(digest.size_bytes); while let Some(response) = stream.next().await { // Record the observed time to receive the first response for this read. - if !got_first_response { - got_first_response = true; - + if let Some(start) = start_opt.take() { if let Some(workunit_store_handle) = workunit_store::get_workunit_store_handle() { - let timing: Result = Instant::now() - .duration_since(start_time) - .as_micros() - .try_into(); + let timing: Result = + Instant::now().duration_since(start).as_micros().try_into(); if let Ok(obs) = timing { workunit_store_handle .store - .record_observation(ObservationMetric::RemoteStoreTimeToFirstByte, obs); + .record_observation(ObservationMetric::RemoteStoreTimeToFirstByteMicros, obs); } } } @@ -429,14 +424,14 @@ impl ByteStore { Level::Trace, desc = Some(workunit_desc), |workunit| async move { + let result = result_future.await; workunit.record_observation( ObservationMetric::RemoteStoreReadBlobTimeMicros, start.elapsed().as_micros() as u64, ); - let result = result_future.await; if result.is_ok() { - workunit.increment_counter( - Metric::RemoteStoreBlobBytesDownloaded, + workunit.record_observation( + ObservationMetric::RemoteStoreBlobBytesDownloaded, digest.size_bytes as u64, ); } diff --git a/src/rust/engine/process_execution/src/remote.rs b/src/rust/engine/process_execution/src/remote.rs index b537bc67386..36b9d9d86da 100644 --- a/src/rust/engine/process_execution/src/remote.rs +++ b/src/rust/engine/process_execution/src/remote.rs @@ -216,9 +216,10 @@ impl CommandRunner { .as_micros() .try_into(); if let Ok(obs) = timing { - context - .workunit_store - .record_observation(ObservationMetric::RemoteExecutionRPCFirstResponseTime, obs); + context.workunit_store.record_observation( + ObservationMetric::RemoteExecutionRPCFirstResponseTimeMicros, + obs, + ); } } diff --git a/src/rust/engine/workunit_store/src/metrics.rs b/src/rust/engine/workunit_store/src/metrics.rs index b29a7e43e20..2ebb57d48df 100644 --- a/src/rust/engine/workunit_store/src/metrics.rs +++ b/src/rust/engine/workunit_store/src/metrics.rs @@ -49,10 +49,6 @@ pub enum Metric { RemoteExecutionSuccess, RemoteExecutionTimeouts, RemoteStoreMissingDigest, - /// Total number of bytes of blobs downloaded from a remote CAS. - RemoteStoreBlobBytesDownloaded, - /// Total number of bytes of blobs uploaded to a remote CAS. - RemoteStoreBlobBytesUploaded, /// Number of times that we backtracked due to missing digests. BacktrackAttempts, } @@ -71,9 +67,13 @@ pub enum ObservationMetric { LocalStoreReadBlobSize, LocalStoreReadBlobTimeMicros, RemoteProcessTimeRunMs, - RemoteExecutionRPCFirstResponseTime, - RemoteStoreTimeToFirstByte, + RemoteExecutionRPCFirstResponseTimeMicros, + RemoteStoreTimeToFirstByteMicros, RemoteStoreReadBlobTimeMicros, + /// Total number of bytes of blobs downloaded from a remote CAS. + RemoteStoreBlobBytesDownloaded, + /// Total number of bytes of blobs uploaded to a remote CAS. + RemoteStoreBlobBytesUploaded, /// The time saved (in milliseconds) thanks to a local cache hit instead of running the process /// directly. LocalCacheTimeSavedMs,