Skip to content

Commit

Permalink
pageserver: layer count & size metrics (#8410)
Browse files Browse the repository at this point in the history
## Problem

We lack insight into:
- How much of a tenant's physical size is image vs. delta layers
- Average sizes of image vs. delta layers
- Total layer counts per timeline, indicating size of index_part object

As well as general observability love, this is motivated by
#6738, where we need to
define some sensible thresholds for storage amplification, and using
total physical size may not work well (if someone does a lot of DROPs
then it's legitimate for the physical-synthetic ratio to be huge), but
the ratio between image layer size and delta layer size may be a better
indicator of whether we're generating unreasonable quantities of image
layers.

## Summary of changes

- Add pageserver_layer_bytes and pageserver_layer_count metrics,
labelled by timeline and `kind` (delta or image)
- Add & subtract these with LayerInner's lifetime.

I'm intentionally avoiding using a generic metric RAII guard object, to
avoid bloating LayerInner: it already has all the information it needs
to update metric on new+drop.
  • Loading branch information
jcsp authored Jul 17, 2024
1 parent da84a25 commit 0c236fa
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 0 deletions.
94 changes: 94 additions & 0 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});

#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum MetricLayerKind {
Delta,
Image,
}

static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_bytes",
"Sum of layer physical sizes in bytes",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});

static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_count",
"Number of layers that exist",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});

static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_archive_size",
Expand Down Expand Up @@ -2141,6 +2166,10 @@ pub(crate) struct TimelineMetrics {
pub last_record_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub(crate) layer_size_image: UIntGauge,
pub(crate) layer_count_image: UIntGauge,
pub(crate) layer_size_delta: UIntGauge,
pub(crate) layer_count_delta: UIntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
Expand Down Expand Up @@ -2223,6 +2252,42 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();

let layer_size_image = TIMELINE_LAYER_SIZE
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Image.into(),
])
.unwrap();

let layer_count_image = TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Image.into(),
])
.unwrap();

let layer_size_delta = TIMELINE_LAYER_SIZE
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Delta.into(),
])
.unwrap();

let layer_count_delta = TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Delta.into(),
])
.unwrap();

let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
Expand Down Expand Up @@ -2277,6 +2342,10 @@ impl TimelineMetrics {
last_record_gauge,
pitr_history_size,
archival_size,
layer_size_image,
layer_count_image,
layer_size_delta,
layer_count_delta,
standby_horizon_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
Expand Down Expand Up @@ -2338,6 +2407,31 @@ impl TimelineMetrics {
let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Image.into(),
]);
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Image.into(),
]);
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Delta.into(),
]);
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Delta.into(),
]);

let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
Expand Down
21 changes: 21 additions & 0 deletions pageserver/src/tenant/storage_layer/layer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,18 @@ impl Drop for LayerInner {
// and we could be delaying shutdown for nothing.
}

if let Some(timeline) = self.timeline.upgrade() {
// Only need to decrement metrics if the timeline still exists: otherwise
// it will have already de-registered these metrics via TimelineMetrics::shutdown
if self.desc.is_delta() {
timeline.metrics.layer_count_delta.dec();
timeline.metrics.layer_size_delta.sub(self.desc.file_size);
} else {
timeline.metrics.layer_count_image.dec();
timeline.metrics.layer_size_image.sub(self.desc.file_size);
}
}

if !*self.wanted_deleted.get_mut() {
return;
}
Expand Down Expand Up @@ -791,6 +803,15 @@ impl LayerInner {
(heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
};

// This object acts as a RAII guard on these metrics: increment on construction
if desc.is_delta() {
timeline.metrics.layer_count_delta.inc();
timeline.metrics.layer_size_delta.add(desc.file_size);
} else {
timeline.metrics.layer_count_image.inc();
timeline.metrics.layer_size_image.add(desc.file_size);
}

LayerInner {
conf,
debug_str: {
Expand Down
2 changes: 2 additions & 0 deletions test_runner/fixtures/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
"pageserver_smgr_query_seconds_sum",
"pageserver_archive_size",
"pageserver_pitr_history_size",
"pageserver_layer_bytes",
"pageserver_layer_count",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
"pageserver_evictions_total",
Expand Down

1 comment on commit 0c236fa

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3202 tests run: 3069 passed, 0 failed, 133 skipped (full report)


Code coverage* (full report)

  • functions: 32.7% (6998 of 21394 functions)
  • lines: 50.2% (55258 of 110159 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
0c236fa at 2024-07-17T22:25:51.615Z :recycle:

Please sign in to comment.