Skip to content

Commit

Permalink
pageserver: further refactoring from TenantId to TenantShardId (#6059)
Browse files Browse the repository at this point in the history
## Problem

In #5957, the most essential
types were updated to use TenantShardId rather than TenantId. That
unblocked other work, but didn't fully enable running multiple shards
from the same tenant on the same pageserver.

## Summary of changes

- Use TenantShardId in page cache key for materialized pages
- Update mgr.rs get_tenant() and list_tenants() functions to use a shard
id, and update all callers.
- Eliminate the exactly_one_or_none helper in mgr.rs and all code that
used it
- Convert timeline HTTP routes to use tenant_shard_id

Note on page cache:
```
struct MaterializedPageHashKey {
    /// Why is this TenantShardId rather than TenantId?
    ///
    /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant.  However, this
    /// is not the case for certain internally-generated pages (e.g. relation sizes).  In future, we may make this
    /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
    /// special-cased in some other way.
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    key: Key,
}
```
  • Loading branch information
jcsp authored Dec 11, 2023
1 parent 66a7a22 commit f1fc1fd
Show file tree
Hide file tree
Showing 21 changed files with 297 additions and 277 deletions.
2 changes: 1 addition & 1 deletion control_plane/src/bin/neon_local.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ fn print_timelines_tree(
info: t.clone(),
children: BTreeSet::new(),
name: timeline_name_mappings
.remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
.remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
},
)
})
Expand Down
2 changes: 1 addition & 1 deletion control_plane/src/tenant_migration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ pub fn migrate_tenant(
let found = other_ps_tenants
.into_iter()
.map(|t| t.id)
.any(|i| i == tenant_id);
.any(|i| i.tenant_id == tenant_id);
if !found {
continue;
}
Expand Down
8 changes: 4 additions & 4 deletions libs/pageserver_api/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ pub enum TenantAttachmentStatus {

#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
pub id: TenantId,
pub id: TenantShardId,
// NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
pub state: TenantState,
/// Sum of the size of all layer files.
Expand All @@ -369,7 +369,7 @@ pub struct TenantInfo {
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
pub tenant_id: TenantId,
pub tenant_id: TenantShardId,
pub timeline_id: TimelineId,

pub ancestor_timeline_id: Option<TimelineId>,
Expand Down Expand Up @@ -823,7 +823,7 @@ mod tests {
fn test_tenantinfo_serde() {
// Test serialization/deserialization of TenantInfo
let original_active = TenantInfo {
id: TenantId::generate(),
id: TenantShardId::unsharded(TenantId::generate()),
state: TenantState::Active,
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
Expand All @@ -840,7 +840,7 @@ mod tests {
});

let original_broken = TenantInfo {
id: TenantId::generate(),
id: TenantShardId::unsharded(TenantId::generate()),
state: TenantState::Broken {
reason: "reason".into(),
backtrace: "backtrace info".into(),
Expand Down
5 changes: 5 additions & 0 deletions libs/pageserver_api/src/shard.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ impl TenantShardId {
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
ShardSlug(self)
}

/// Convenience for code that has special behavior on the 0th shard.
pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
}

/// Formatting helper
Expand Down
14 changes: 11 additions & 3 deletions pageserver/src/consumption_metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -269,12 +269,18 @@ async fn calculate_synthetic_size_worker(
}
};

for (tenant_id, tenant_state) in tenants {
for (tenant_shard_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}

if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
if !tenant_shard_id.is_zero() {
// We only send consumption metrics from shard 0, so don't waste time calculating
// synthetic size on other shards.
continue;
}

if let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) {
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics.
Expand All @@ -286,7 +292,9 @@ async fn calculate_synthetic_size_worker(
{
return Ok(());
}
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
error!(
"failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"
);
}
}
}
Expand Down
10 changes: 2 additions & 8 deletions pageserver/src/consumption_metrics/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogi
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
use pageserver_api::shard::ShardNumber;
use std::{sync::Arc, time::SystemTime};
use utils::{
id::{TenantId, TimelineId},
Expand Down Expand Up @@ -198,12 +197,12 @@ pub(super) async fn collect_all_metrics(
};

let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
if state != TenantState::Active {
if state != TenantState::Active || !id.is_zero() {
None
} else {
crate::tenant::mgr::get_tenant(id, true)
.ok()
.map(|tenant| (id, tenant))
.map(|tenant| (id.tenant_id, tenant))
}
});

Expand All @@ -229,11 +228,6 @@ where
while let Some((tenant_id, tenant)) = tenants.next().await {
let mut tenant_resident_size = 0;

// Sharded tenants report all consumption metrics from shard zero
if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
continue;
}

for timeline in tenant.list_timelines() {
let timeline_id = timeline.timeline_id;

Expand Down
Loading

1 comment on commit f1fc1fd

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2216 tests run: 2126 passed, 0 failed, 90 skipped (full report)


Flaky tests (4)

Postgres 15

  • test_no_config[application/json]: release
  • test_statvfs_pressure_min_avail_bytes: debug
  • test_eviction_across_generations: debug

Postgres 14

  • test_competing_branchings_from_loading_race_to_ok_or_err: release

Code coverage (full report)

  • functions: 54.8% (9360 of 17077 functions)
  • lines: 82.1% (54415 of 66298 lines)

The comment gets automatically updated with the latest test results
f1fc1fd at 2023-12-11T17:08:08.150Z :recycle:

Please sign in to comment.