From 121b5e9a81dfa90154d9c6e5f59fc09fb22bf0a8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 15 Jan 2023 15:55:30 +0200 Subject: [PATCH 01/44] Add Layer::overlaps method and use it in count_deltas to avoid unnessaory image layer generation refer #2948 --- pageserver/src/tenant/layer_map.rs | 18 ++++++++------- pageserver/src/tenant/storage_layer.rs | 10 ++++++++ .../src/tenant/storage_layer/delta_layer.rs | 23 +++++++++++++++++++ 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 9d8c825220d5..c7969f782890 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -473,18 +473,21 @@ where /// TODO The optimal number should probably be slightly higher than 1, but to /// implement that we need to plumb a lot more context into this function /// than just the current partition_range. - pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> bool { + pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> Result { + if !layer.overlaps(key_range)? { + return Ok(false); + } // Case 1 if !Self::is_l0(layer) { - return true; + return Ok(true); } // Case 2 if range_eq(partition_range, &(Key::MIN..Key::MAX)) { - return true; + return Ok(true); } - false + Ok(false) } /// Count the height of the tallest stack of reimage-worthy deltas @@ -531,7 +534,7 @@ where let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(&val, key)? as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; @@ -542,7 +545,6 @@ where } } } - current_key = change_key; current_val = change_val.clone(); } @@ -554,7 +556,7 @@ where let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(&val, key)? as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; max_stacked_deltas = std::cmp::max( @@ -577,7 +579,7 @@ where match self.search(key, lsn) { Some(search_result) => { if search_result.layer.is_incremental() { - (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize) + (Self::is_reimage_worthy(&search_result.layer, partition_range)? 
as usize) + self.get_difficulty(search_result.lsn_floor, key, partition_range) } else { 0 diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index e85359af1626..1b6bda87ca55 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -283,6 +283,11 @@ pub trait Layer: std::fmt::Debug + Send + Sync { /// Dump summary of the contents of the layer to stdout fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>; + + /// Checks if layer contains any entries belonging to the specified key range + fn overlaps(&self, key_range: &Range) -> Result { + Ok(range_overlaps(&self.get_key_range(), key_range)) + } } /// Returned by [`Layer::iter`] @@ -401,6 +406,11 @@ impl Layer for LayerDescriptor { fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { todo!() } + + /// Checks if layer contains any entries belonging to the specified key range + fn overlaps(&self, key_range: &Range) -> Result { + Ok(range_overlaps(&self.get_key_range(), key_range)) + } } impl From for LayerDescriptor { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 9b322faa652e..60a34202bf4f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -30,6 +30,7 @@ use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::storage_layer::range_overlaps; use crate::tenant::storage_layer::{ PersistentLayer, ValueReconstructResult, ValueReconstructState, }; @@ -315,6 +316,28 @@ impl Layer for DeltaLayer { Ok(()) } + fn overlaps(&self, key_range: &Range) -> anyhow::Result { + if !range_overlaps(&self.key_range, key_range) { + return Ok(false); + } + // Open the file and lock the metadata in memory + let inner = self.load()?; + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + let search_key = DeltaKey::from_key_lsn(&key_range.start, Lsn(0)); + let mut overlaps = false; + tree_reader.visit(&search_key.0, VisitDirection::Forwards, |key, value| { + let key = Key::from_slice(&key[..KEY_SIZE]); + overlaps = key < key_range.end; + false + })?; + Ok(overlaps) + } + fn get_value_reconstruct_data( &self, key: Key, From 7d1e6aca21584f8f9952d248fb49660b612d7381 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 15 Jan 2023 22:13:37 +0200 Subject: [PATCH 02/44] Remove wanrings --- pageserver/src/tenant/storage_layer/delta_layer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 60a34202bf4f..685edd159161 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -330,7 +330,7 @@ impl Layer for DeltaLayer { ); let search_key = DeltaKey::from_key_lsn(&key_range.start, Lsn(0)); let mut overlaps = false; - tree_reader.visit(&search_key.0, VisitDirection::Forwards, |key, value| { + tree_reader.visit(&search_key.0, VisitDirection::Forwards, |key, _value| { let key = Key::from_slice(&key[..KEY_SIZE]); overlaps = key < key_range.end; false From 
c14cc5b29ceaa57635112e8760676d630856d21b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 20 Jan 2023 16:51:04 +0200 Subject: [PATCH 03/44] Store information about largest holes in layer map --- pageserver/src/tenant/layer_map.rs | 14 ++- pageserver/src/tenant/storage_layer.rs | 10 ++ .../src/tenant/storage_layer/delta_layer.rs | 114 +++++++++++++++--- pageserver/src/tenant/timeline.rs | 20 +-- 4 files changed, 131 insertions(+), 27 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index c7969f782890..4e573bf3f690 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -225,7 +225,8 @@ where let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); - match (latest_delta, latest_image) { + loop { + return match (latest_delta, latest_image) { (None, None) => None, (None, Some(image)) => { let lsn_floor = image.get_lsn_range().start; @@ -235,6 +236,11 @@ where }) } (Some(delta), None) => { + if let Ok(contains) = delta.overlaps(&(key..key.next())) { + if !contains { + continue; + } + } let lsn_floor = delta.get_lsn_range().start; Some(SearchResult { layer: delta, @@ -251,6 +257,11 @@ where lsn_floor: img_lsn, }) } else { + if let Ok(contains) = delta.overlaps(&(key..key.next())) { + if !contains { + continue; + } + } let lsn_floor = std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); Some(SearchResult { @@ -260,6 +271,7 @@ where } } } + } } /// Start a batch of updates, applied on drop diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 1b6bda87ca55..5833fd9f30bd 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -288,6 +288,11 @@ pub trait Layer: std::fmt::Debug + Send + Sync { fn overlaps(&self, key_range: &Range) -> Result { Ok(range_overlaps(&self.get_key_range(), key_range)) } + + /// Skip holes in this layer key range + fn get_occupied_ranges(&self) -> Result>> { + Ok(vec![self.get_key_range()]) + } } /// Returned by [`Layer::iter`] @@ -411,6 +416,11 @@ impl Layer for LayerDescriptor { fn overlaps(&self, key_range: &Range) -> Result { Ok(range_overlaps(&self.get_key_range(), key_range)) } + + /// Skip holes in this layer key range + fn get_occupied_ranges(&self) -> Result>> { + Ok(vec![self.get_key_range()]) + } } impl From for LayerDescriptor { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 685edd159161..593002cc3ef1 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -41,6 +41,8 @@ use anyhow::{bail, ensure, Context, Result}; use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::cmp::Ordering; +use std::collections::BinaryHeap; use std::fs::{self, File}; use std::io::{BufWriter, Write}; use std::io::{Seek, SeekFrom}; @@ -61,6 +63,11 @@ use super::{ LayerKeyIter, LayerResidenceStatus, PathOrConf, }; +// Number of holes for delta layer kept in memory by layer map to speedup DeltaLayer::overlaps operation. +// To much number of layers can cause large memory footprint of layer map, +const MAX_CACHED_HOLES: usize = 10; // TODO: move it to tenant config? 
(not available in layer methods) +const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128; // TODO: use compaction_target? (see above) + /// /// Header stored in the beginning of the file /// @@ -171,6 +178,24 @@ impl DeltaKey { } } +/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap +#[derive(PartialEq, Eq)] +struct Hole(Range); + +impl Ord for Hole { + fn cmp(&self, other: &Self) -> Ordering { + let other_len = other.0.end.to_i128() - other.0.start.to_i128(); + let self_len = self.0.end.to_i128() - self.0.start.to_i128(); + other_len.cmp(&self_len) + } +} + +impl PartialOrd for Hole { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + /// DeltaLayer is the in-memory data structure associated with an on-disk delta /// file. /// @@ -214,6 +239,9 @@ pub struct DeltaLayerInner { /// Reader object for reading blocks from the file. (None if not loaded yet) file: Option>, + + /// Largest holes in this layer + holes: Option>, } impl std::fmt::Debug for DeltaLayerInner { @@ -316,26 +344,44 @@ impl Layer for DeltaLayer { Ok(()) } + fn get_occupied_ranges(&self) -> Result>> { + let inner = self.load()?; + if let Some(holes) = &inner.holes { + let mut occ = Vec::with_capacity(holes.len() + 1); + let key_range = self.get_key_range(); + let mut prev = key_range.start; + for hole in holes { + occ.push(prev..hole.0.start); + prev = hole.0.end; + } + occ.push(prev..key_range.end); + Ok(occ) + } else { + Ok(vec![self.get_key_range()]) + } + } + fn overlaps(&self, key_range: &Range) -> anyhow::Result { if !range_overlaps(&self.key_range, key_range) { - return Ok(false); + Ok(false) + } else { + let inner = self.load()?; + if let Some(holes) = &inner.holes { + let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) + { + Ok(index) => index, + Err(index) => { + if index == 0 { + return Ok(true); + } + index - 1 + } + }; + Ok(holes[start].0.end < key_range.end) + } else { + Ok(true) + } } - // Open the file and lock the metadata in memory - let inner = self.load()?; - let file = inner.file.as_ref().unwrap(); - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - inner.index_start_blk, - inner.index_root_blk, - file, - ); - let search_key = DeltaKey::from_key_lsn(&key_range.start, Lsn(0)); - let mut overlaps = false; - tree_reader.visit(&search_key.0, VisitDirection::Forwards, |key, _value| { - let key = Key::from_slice(&key[..KEY_SIZE]); - overlaps = key < key_range.end; - false - })?; - Ok(overlaps) } fn get_value_reconstruct_data( @@ -608,6 +654,37 @@ impl DeltaLayer { debug!("loaded from {}", &path.display()); + // Construct vector with largest holes + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(MAX_CACHED_HOLES + 1); + let mut prev_key: Option = None; + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, _value| { + let curr = Key::from_slice(&key[..KEY_SIZE]); + if let Some(prev) = prev_key { + if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH { + heap.push(Hole(prev..curr)); + if heap.len() > MAX_CACHED_HOLES { + heap.pop(); // remove smallest hole + } + } + } + prev_key = Some(curr.next()); + true + }, + )?; + let mut holes = heap.into_vec(); + holes.sort_by_key(|hole| hole.0.start); + inner.holes = 
Some(holes); + inner.loaded = true; Ok(()) } @@ -634,6 +711,7 @@ impl DeltaLayer { file: None, index_start_blk: 0, index_root_blk: 0, + holes: None, }), } } @@ -664,6 +742,7 @@ impl DeltaLayer { file: None, index_start_blk: 0, index_root_blk: 0, + holes: None, }), }) } @@ -835,6 +914,7 @@ impl DeltaLayerWriterInner { file: None, index_start_blk, index_root_blk, + holes: None, }), }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 838df6d88431..c74fedb9aa1b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3292,15 +3292,17 @@ impl Timeline { // If GC horizon is at 2500, we can remove layers A and B, but // we cannot remove C, even though it's older than 2500, because // the delta layer 2000-3000 depends on it. - if !layers - .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? - { - debug!( - "keeping {} because it is the latest layer", - l.filename().file_name() - ); - result.layers_not_updated += 1; - continue 'outer; + for occupied_range in &l.get_occupied_ranges()? { + if !layers + .image_layer_exists(occupied_range, &(l.get_lsn_range().end..new_gc_cutoff))? + { + debug!( + "keeping {} because it is the latest layer", + l.filename().file_name() + ); + result.layers_not_updated += 1; + continue 'outer; + } } // We didn't find any reason to keep this file, so remove it. From 9b2321ef13213c9c1dd311d4160b40aed4df0df7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 26 Jan 2023 19:38:40 +0200 Subject: [PATCH 04/44] Merge with main --- pageserver/src/tenant/layer_map.rs | 99 ++++++++++++++++-------------- 1 file changed, 52 insertions(+), 47 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 4e573bf3f690..75637b67129d 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -220,58 +220,62 @@ where /// NOTE: This only searches the 'historic' layers, *not* the /// 'open' and 'frozen' layers! 
/// - pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { - let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; - let latest_delta = version.delta_coverage.query(key.to_i128()); - let latest_image = version.image_coverage.query(key.to_i128()); - - loop { - return match (latest_delta, latest_image) { - (None, None) => None, - (None, Some(image)) => { - let lsn_floor = image.get_lsn_range().start; - Some(SearchResult { - layer: image, - lsn_floor, - }) - } - (Some(delta), None) => { - if let Ok(contains) = delta.overlaps(&(key..key.next())) { - if !contains { - continue; - } - } - let lsn_floor = delta.get_lsn_range().start; - Some(SearchResult { - layer: delta, - lsn_floor, - }) - } - (Some(delta), Some(image)) => { - let img_lsn = image.get_lsn_range().start; - let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; - let image_exact_match = img_lsn + 1 == end_lsn; - if image_is_newer || image_exact_match { + pub fn search(&self, key: Key, mut end_lsn: Lsn) -> Option> { + loop { + let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let latest_delta = version.delta_coverage.query(key.to_i128()); + let latest_image = version.image_coverage.query(key.to_i128()); + + return match (latest_delta, latest_image) { + (None, None) => None, + (None, Some(image)) => { + let lsn_floor = image.get_lsn_range().start; Some(SearchResult { layer: image, - lsn_floor: img_lsn, + lsn_floor, }) - } else { - if let Ok(contains) = delta.overlaps(&(key..key.next())) { - if !contains { - continue; - } - } - let lsn_floor = - std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + } + (Some(delta), None) => { + let lsn_floor = delta.get_lsn_range().start; + if let Ok(contains) = delta.overlaps(&(key..key.next())) { + if !contains { + end_lsn = lsn_floor; + continue; + } + } Some(SearchResult { layer: delta, lsn_floor, }) } - } + (Some(delta), Some(image)) => { + let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { + Some(SearchResult { + layer: image, + lsn_floor: img_lsn, + }) + } else { + if let Ok(contains) = delta.overlaps(&(key..key.next())) { + if !contains { + end_lsn = delta.get_lsn_range().start; + continue; + } + } + let lsn_floor = std::cmp::max( + delta.get_lsn_range().start, + image.get_lsn_range().start + 1, + ); + Some(SearchResult { + layer: delta, + lsn_floor, + }) + } + } + }; } - } } /// Start a batch of updates, applied on drop @@ -486,9 +490,9 @@ where /// implement that we need to plumb a lot more context into this function /// than just the current partition_range. pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> Result { - if !layer.overlaps(key_range)? { - return Ok(false); - } + if !layer.overlaps(partition_range)? { + return Ok(false); + } // Case 1 if !Self::is_l0(layer) { return Ok(true); @@ -591,7 +595,8 @@ where match self.search(key, lsn) { Some(search_result) => { if search_result.layer.is_incremental() { - (Self::is_reimage_worthy(&search_result.layer, partition_range)? 
as usize) + (Self::is_reimage_worthy(&search_result.layer, partition_range).unwrap() + as usize) + self.get_difficulty(search_result.lsn_floor, key, partition_range) } else { 0 From 8f23a5e7ea1018f80cc146d7227de398aa78124a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 27 Jan 2023 16:47:20 +0200 Subject: [PATCH 05/44] Store information about holes in delta layers in S3 index file --- .../src/tenant/remote_timeline_client.rs | 8 ++-- .../tenant/remote_timeline_client/index.rs | 27 ++++++++++- pageserver/src/tenant/storage_layer.rs | 20 ++++---- .../src/tenant/storage_layer/delta_layer.rs | 45 +++++++++--------- .../src/tenant/storage_layer/remote_layer.rs | 47 ++++++++++++++++++- pageserver/src/tenant/timeline.rs | 20 +++++--- 6 files changed, 121 insertions(+), 46 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 985b480a764c..abb7f3b2a951 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -423,7 +423,7 @@ impl RemoteTimelineClient { // might be missing some information for the file; this allows us // to fill in the missing details. if layer_metadata.file_size().is_none() { - let new_metadata = LayerFileMetadata::new(downloaded_size); + let new_metadata = LayerFileMetadata::new(downloaded_size, None); let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { @@ -1153,11 +1153,11 @@ mod tests { client.schedule_layer_file_upload( &layer_file_name_1, - &LayerFileMetadata::new(content_1.len() as u64), + &LayerFileMetadata::new(content_1.len() as u64, None), )?; client.schedule_layer_file_upload( &layer_file_name_2, - &LayerFileMetadata::new(content_2.len() as u64), + &LayerFileMetadata::new(content_2.len() as u64, None), )?; // Check that they are started immediately, not queued @@ -1209,7 +1209,7 @@ mod tests { std::fs::write(timeline_path.join("baz"), &content_baz)?; client.schedule_layer_file_upload( &layer_file_name_3, - &LayerFileMetadata::new(content_baz.len() as u64), + &LayerFileMetadata::new(content_baz.len() as u64, None), )?; client.schedule_layer_file_deletion(&[layer_file_name_1.clone()])?; { diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 420edae6cd44..774b0b52ee1e 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -9,6 +9,7 @@ use serde_with::{serde_as, DisplayFromStr}; use tracing::warn; use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::storage_layer::Hole; use crate::tenant::storage_layer::LayerFileName; use utils::lsn::Lsn; @@ -21,31 +22,41 @@ use utils::lsn::Lsn; #[cfg_attr(test, derive(Default))] pub struct LayerFileMetadata { file_size: Option, + holes: Option>, } impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { fn from(other: &IndexLayerMetadata) -> Self { LayerFileMetadata { file_size: other.file_size, + holes: other.holes.clone(), } } } impl LayerFileMetadata { - pub fn new(file_size: u64) -> Self { + pub fn new(file_size: u64, holes: Option>) -> Self { LayerFileMetadata { file_size: Some(file_size), + holes, } } /// This is used to initialize the metadata for remote layers, for which /// the metadata was missing from the index part file. 
- pub const MISSING: Self = LayerFileMetadata { file_size: None }; + pub const MISSING: Self = LayerFileMetadata { + file_size: None, + holes: None, + }; pub fn file_size(&self) -> Option { self.file_size } + pub fn holes(&self) -> Option> { + self.holes.clone() + } + /// Metadata has holes due to version upgrades. This method is called to upgrade self with the /// other value. /// @@ -59,6 +70,16 @@ impl LayerFileMetadata { changed = true; } + if self.holes != other.holes { + self.holes = other + .holes + .as_ref() + .or(self.holes.as_ref()) + .clone() + .cloned(); + changed = true; + } + changed } } @@ -233,12 +254,14 @@ impl IndexPart { #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] pub struct IndexLayerMetadata { pub(super) file_size: Option, + pub(super) holes: Option>, } impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { fn from(other: &'_ LayerFileMetadata) -> Self { IndexLayerMetadata { file_size: other.file_size, + holes: other.holes.clone(), } } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 5833fd9f30bd..f63163832875 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -19,6 +19,7 @@ use pageserver_api::models::LayerAccessKind; use pageserver_api::models::{ HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, }; +use serde::{Deserialize, Serialize}; use std::ops::Range; use std::path::PathBuf; use std::sync::{Arc, Mutex}; @@ -293,6 +294,11 @@ pub trait Layer: std::fmt::Debug + Send + Sync { fn get_occupied_ranges(&self) -> Result>> { Ok(vec![self.get_key_range()]) } + + /// Get list of holes + fn get_holes(&self) -> Result>> { + Ok(None) + } } /// Returned by [`Layer::iter`] @@ -381,6 +387,10 @@ pub struct LayerDescriptor { pub short_id: String, } +/// Wrapper for key range to provide reverse ordering by range length (for BinaryHeap) +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct Hole(Range); + impl Layer for LayerDescriptor { fn get_key_range(&self) -> Range { self.key.clone() @@ -411,16 +421,6 @@ impl Layer for LayerDescriptor { fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { todo!() } - - /// Checks if layer contains any entries belonging to the specified key range - fn overlaps(&self, key_range: &Range) -> Result { - Ok(range_overlaps(&self.get_key_range(), key_range)) - } - - /// Skip holes in this layer key range - fn get_occupied_ranges(&self) -> Result>> { - Ok(vec![self.get_key_range()]) - } } impl From for LayerDescriptor { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 593002cc3ef1..91b447dcf32c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -32,7 +32,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReade use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::range_overlaps; use crate::tenant::storage_layer::{ - PersistentLayer, ValueReconstructResult, ValueReconstructState, + Hole, PersistentLayer, ValueReconstructResult, ValueReconstructState, }; use crate::virtual_file::VirtualFile; use crate::{walrecord, TEMP_FILE_SUFFIX}; @@ -68,6 +68,20 @@ use super::{ const MAX_CACHED_HOLES: usize = 10; // TODO: move it to tenant config? 
(not available in layer methods) const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128; // TODO: use compaction_target? (see above) +impl Ord for Hole { + fn cmp(&self, other: &Self) -> Ordering { + let other_len = other.0.end.to_i128() - other.0.start.to_i128(); + let self_len = self.0.end.to_i128() - self.0.start.to_i128(); + other_len.cmp(&self_len) + } +} + +impl PartialOrd for Hole { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + /// /// Header stored in the beginning of the file /// @@ -177,27 +191,6 @@ impl DeltaKey { Lsn(u64::from_be_bytes(lsn_buf)) } } - -/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap -#[derive(PartialEq, Eq)] -struct Hole(Range); - -impl Ord for Hole { - fn cmp(&self, other: &Self) -> Ordering { - let other_len = other.0.end.to_i128() - other.0.start.to_i128(); - let self_len = self.0.end.to_i128() - self.0.start.to_i128(); - other_len.cmp(&self_len) - } -} - -impl PartialOrd for Hole { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -/// DeltaLayer is the in-memory data structure associated with an on-disk delta -/// file. /// /// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer /// is in "loaded" state, we have a copy of the index in memory, in 'inner'. @@ -361,6 +354,11 @@ impl Layer for DeltaLayer { } } + fn get_holes(&self) -> Result>> { + let inner = self.load()?; + Ok(inner.holes.clone()) + } + fn overlaps(&self, key_range: &Range) -> anyhow::Result { if !range_overlaps(&self.key_range, key_range) { Ok(false) @@ -697,6 +695,7 @@ impl DeltaLayer { filename: &DeltaFileName, file_size: u64, access_stats: LayerAccessStats, + holes: Option>, ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), @@ -711,7 +710,7 @@ impl DeltaLayer { file: None, index_start_blk: 0, index_root_blk: 0, - holes: None, + holes, }), } } diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 7391875d0c39..7f5ae959309e 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -5,7 +5,9 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::repository::Key; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::storage_layer::{ + range_overlaps, Hole, Layer, ValueReconstructResult, ValueReconstructState, +}; use anyhow::{bail, Result}; use pageserver_api::models::HistoricLayerInfo; use std::ops::Range; @@ -87,6 +89,48 @@ impl Layer for RemoteLayer { self.is_incremental } + fn get_occupied_ranges(&self) -> Result>> { + if let Some(holes) = &self.layer_metadata.holes() { + let mut occ = Vec::with_capacity(holes.len() + 1); + let key_range = self.get_key_range(); + let mut prev = key_range.start; + for hole in holes { + occ.push(prev..hole.0.start); + prev = hole.0.end; + } + occ.push(prev..key_range.end); + Ok(occ) + } else { + Ok(vec![self.get_key_range()]) + } + } + + fn get_holes(&self) -> Result>> { + Ok(self.layer_metadata.holes().clone()) + } + + fn overlaps(&self, key_range: &Range) -> anyhow::Result { + if !range_overlaps(&self.key_range, key_range) { + Ok(false) + } else { + if let Some(holes) = &self.layer_metadata.holes() { + let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) + { + Ok(index) => 
index, + Err(index) => { + if index == 0 { + return Ok(true); + } + index - 1 + } + }; + Ok(holes[start].0.end < key_range.end) + } else { + Ok(true) + } + } + } + /// debugging function to print out the contents of the layer fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { println!( @@ -251,6 +295,7 @@ impl RemoteLayer { file_size, self.access_stats .clone_for_residence_change(LayerResidenceStatus::Resident), + self.get_holes().unwrap(), )) } else { let fname = ImageFileName { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c74fedb9aa1b..e8646225e70e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1312,6 +1312,7 @@ impl Timeline { &deltafilename, file_size, LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + None, ); trace!("found layer {}", layer.path().display()); @@ -1552,8 +1553,10 @@ impl Timeline { .with_context(|| format!("failed to get file {layer_path:?} metadata"))? .len(); info!("scheduling {layer_path:?} for upload"); - remote_client - .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?; + remote_client.schedule_layer_file_upload( + layer_name, + &LayerFileMetadata::new(layer_size, layer.get_holes()?), + )?; } remote_client.schedule_index_upload_for_file_changes()?; @@ -2484,6 +2487,8 @@ impl Timeline { self.conf.timeline_path(&self.timeline_id, &self.tenant_id), ])?; + let holes = new_delta.get_holes()?; + // Add it to the layer map self.layers .write() @@ -2499,7 +2504,7 @@ impl Timeline { self.metrics.num_persistent_files_created.inc_by(1); self.metrics.persistent_bytes_written.inc_by(sz); - Ok((new_delta_filename, LayerFileMetadata::new(sz))) + Ok((new_delta_filename, LayerFileMetadata::new(sz, holes))) } async fn repartition( @@ -2667,7 +2672,7 @@ impl Timeline { .metadata() .with_context(|| format!("reading metadata of layer file {}", path.file_name()))?; - layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); + layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len(), None)); self.metrics .resident_physical_size_gauge @@ -2988,7 +2993,7 @@ impl Timeline { if let Some(remote_client) = &self.remote_client { remote_client.schedule_layer_file_upload( &l.filename(), - &LayerFileMetadata::new(metadata.len()), + &LayerFileMetadata::new(metadata.len(), l.get_holes()?), )?; } @@ -2997,7 +3002,10 @@ impl Timeline { .resident_physical_size_gauge .add(metadata.len()); - new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); + new_layer_paths.insert( + new_delta_path, + LayerFileMetadata::new(metadata.len(), l.get_holes()?), + ); let x: Arc = Arc::new(l); updates.insert_historic(x); } From 652fd5a1f3eeab6e64e20ff4d6e2d4eda41fee40 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 27 Jan 2023 17:14:50 +0200 Subject: [PATCH 06/44] Remove excessive clone() when merging holes --- pageserver/src/tenant/remote_timeline_client/index.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 774b0b52ee1e..f48ea145d1cd 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -71,12 +71,7 @@ impl LayerFileMetadata { } if self.holes != other.holes { - self.holes = other - .holes - .as_ref() - .or(self.holes.as_ref()) - .clone() - .cloned(); + self.holes = 
other.holes.as_ref().or(self.holes.as_ref()).cloned(); changed = true; } From aa89e26f3c4e6e671b4813e0f0e4194386389fdb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 27 Jan 2023 19:18:37 +0200 Subject: [PATCH 07/44] Fix unit tests building --- pageserver/src/tenant/remote_timeline_client/index.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index f48ea145d1cd..e046d1a36c26 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -306,11 +306,13 @@ mod tests { layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: Some(25600000), + holes: None, }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: Some(9007199254741001), + holes: None, }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), @@ -344,11 +346,13 @@ mod tests { layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: Some(25600000), + holes: None, }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: Some(9007199254741001), + holes: None, }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), From 119afc3be3f5980010500d0489eec04a37431c6a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 27 Jan 2023 19:42:55 +0200 Subject: [PATCH 08/44] Make clippy happy --- .../src/tenant/storage_layer/remote_layer.rs | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 7f5ae959309e..24e59fb1cca7 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -106,28 +106,25 @@ impl Layer for RemoteLayer { } fn get_holes(&self) -> Result>> { - Ok(self.layer_metadata.holes().clone()) + Ok(self.layer_metadata.holes()) } fn overlaps(&self, key_range: &Range) -> anyhow::Result { if !range_overlaps(&self.key_range, key_range) { Ok(false) - } else { - if let Some(holes) = &self.layer_metadata.holes() { - let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) - { - Ok(index) => index, - Err(index) => { - if index == 0 { - return Ok(true); - } - index - 1 + } else if let Some(holes) = &self.layer_metadata.holes() { + let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) { + Ok(index) => index, + Err(index) => { + if index == 0 { + return Ok(true); } - }; - Ok(holes[start].0.end < key_range.end) - } else { - Ok(true) - } + index - 1 + } + }; + Ok(holes[start].0.end < key_range.end) + } else { + Ok(true) } } From f63c663e5751017b9ff3683f045ce4f63711f215 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 28 Jan 2023 12:21:56 +0200 Subject: [PATCH 09/44] Add traces to debug physical layer 
size calculation --- pageserver/src/tenant/timeline.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e8646225e70e..f0a15a2be37d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -544,9 +544,11 @@ impl Timeline { let mut no_size_cnt = 0; for l in layer_map.iter_historic_layers() { let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1)); + info!("Layer {} size {}", l.short_id(), l_size); size += l_size; no_size_cnt += l_no_size; } + info!("Total size {} no_size_cnt {}", size, no_size_cnt); if no_size_cnt == 0 { LayerSizeSum::Accurate(size) } else { From 8b85811aed3dca32f52f27a84479d077ca3952f9 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 28 Jan 2023 23:23:36 +0200 Subject: [PATCH 10/44] Temporary disarm overlaps --- .../src/tenant/storage_layer/delta_layer.rs | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 91b447dcf32c..d95f88c0b488 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -363,22 +363,25 @@ impl Layer for DeltaLayer { if !range_overlaps(&self.key_range, key_range) { Ok(false) } else { - let inner = self.load()?; - if let Some(holes) = &inner.holes { - let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) - { - Ok(index) => index, - Err(index) => { - if index == 0 { - return Ok(true); + Ok(true) + /* + let inner = self.load()?; + if let Some(holes) = &inner.holes { + let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) + { + Ok(index) => index, + Err(index) => { + if index == 0 { + return Ok(true); + } + index - 1 + } + }; + Ok(holes[start].0.end < key_range.end) + } else { + Ok(true) } - index - 1 - } - }; - Ok(holes[start].0.end < key_range.end) - } else { - Ok(true) - } + */ } } From 4099101a29a55c9196e6cc81577e9c3614baa17a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 29 Jan 2023 09:24:05 +0200 Subject: [PATCH 11/44] Revert "Temporary disarm overlaps" This reverts commit dbc0078e2b9e0082d7bc854bab040b9862d2b3df. 
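For context, the check being restored here treats a delta layer's key space as its
key range minus a sorted, non-overlapping list of holes: a query range misses the
layer only when it falls entirely inside a single hole. A minimal standalone sketch
of that idea, with plain u64 keys standing in for the pageserver Key type (an
assumption for illustration only, not a copy of the restored code below):

    use std::ops::Range;

    /// `holes` must be sorted by `start` and non-overlapping, and `range` is
    /// assumed to already intersect the layer's overall key range.
    fn overlaps_occupied(holes: &[Range<u64>], range: &Range<u64>) -> bool {
        // Locate the last hole that starts at or before the query start.
        let idx = match holes.binary_search_by_key(&range.start, |h| h.start) {
            Ok(i) => i,
            Err(i) => {
                if i == 0 {
                    // The query starts before the first hole, so it touches
                    // occupied keys immediately.
                    return true;
                }
                i - 1
            }
        };
        // The layer is overlapped unless that hole swallows the whole query range.
        holes[idx].end < range.end
    }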
--- .../src/tenant/storage_layer/delta_layer.rs | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index d95f88c0b488..91b447dcf32c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -363,25 +363,22 @@ impl Layer for DeltaLayer { if !range_overlaps(&self.key_range, key_range) { Ok(false) } else { - Ok(true) - /* - let inner = self.load()?; - if let Some(holes) = &inner.holes { - let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) - { - Ok(index) => index, - Err(index) => { - if index == 0 { - return Ok(true); - } - index - 1 - } - }; - Ok(holes[start].0.end < key_range.end) - } else { - Ok(true) + let inner = self.load()?; + if let Some(holes) = &inner.holes { + let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) + { + Ok(index) => index, + Err(index) => { + if index == 0 { + return Ok(true); } - */ + index - 1 + } + }; + Ok(holes[start].0.end < key_range.end) + } else { + Ok(true) + } } } From 38754b2cc0bb6482ed7c526413e6fbddec905be1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 29 Jan 2023 09:24:15 +0200 Subject: [PATCH 12/44] Revert "Make clippy happy" This reverts commit 390d144d7cc9d6df4c1de3f486071bc024aa459c. --- .../src/tenant/storage_layer/remote_layer.rs | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 24e59fb1cca7..7f5ae959309e 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -106,25 +106,28 @@ impl Layer for RemoteLayer { } fn get_holes(&self) -> Result>> { - Ok(self.layer_metadata.holes()) + Ok(self.layer_metadata.holes().clone()) } fn overlaps(&self, key_range: &Range) -> anyhow::Result { if !range_overlaps(&self.key_range, key_range) { Ok(false) - } else if let Some(holes) = &self.layer_metadata.holes() { - let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) { - Ok(index) => index, - Err(index) => { - if index == 0 { - return Ok(true); - } - index - 1 - } - }; - Ok(holes[start].0.end < key_range.end) } else { - Ok(true) + if let Some(holes) = &self.layer_metadata.holes() { + let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) + { + Ok(index) => index, + Err(index) => { + if index == 0 { + return Ok(true); + } + index - 1 + } + }; + Ok(holes[start].0.end < key_range.end) + } else { + Ok(true) + } } } From 4f38224a9e2e61e93b347657fb6e5fe8c245ac6a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 29 Jan 2023 09:27:16 +0200 Subject: [PATCH 13/44] Ad delay to test_on_demand_download --- test_runner/regress/test_ondemand_download.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 3551f27cad56..fe4d080f2bcb 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -4,6 +4,7 @@ from pathlib import Path import pytest +import time from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -212,6 +213,8 @@ def get_resident_physical_size(): log.info(filled_size) assert filled_current_physical == filled_size, 
"we don't yet do layer eviction" + time.sleep(3) + env.pageserver.stop() # remove all the layer files From c2be42b4fcdcbe5b8b2373108af47e54df8f9b0a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 29 Jan 2023 21:53:00 +0200 Subject: [PATCH 14/44] Revert "Add traces to debug physical layer size calculation" This reverts commit 0ccb7a03c8d091ccee3d1c85308a66ea732aa7bc. --- pageserver/src/tenant/timeline.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f0a15a2be37d..e8646225e70e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -544,11 +544,9 @@ impl Timeline { let mut no_size_cnt = 0; for l in layer_map.iter_historic_layers() { let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1)); - info!("Layer {} size {}", l.short_id(), l_size); size += l_size; no_size_cnt += l_no_size; } - info!("Total size {} no_size_cnt {}", size, no_size_cnt); if no_size_cnt == 0 { LayerSizeSum::Accurate(size) } else { From 310098469cad378727ad6d99ae882665ce6c4509 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 27 Jan 2023 19:42:55 +0200 Subject: [PATCH 15/44] Make clippy happy --- .../src/tenant/storage_layer/remote_layer.rs | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 7f5ae959309e..24e59fb1cca7 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -106,28 +106,25 @@ impl Layer for RemoteLayer { } fn get_holes(&self) -> Result>> { - Ok(self.layer_metadata.holes().clone()) + Ok(self.layer_metadata.holes()) } fn overlaps(&self, key_range: &Range) -> anyhow::Result { if !range_overlaps(&self.key_range, key_range) { Ok(false) - } else { - if let Some(holes) = &self.layer_metadata.holes() { - let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) - { - Ok(index) => index, - Err(index) => { - if index == 0 { - return Ok(true); - } - index - 1 + } else if let Some(holes) = &self.layer_metadata.holes() { + let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) { + Ok(index) => index, + Err(index) => { + if index == 0 { + return Ok(true); } - }; - Ok(holes[start].0.end < key_range.end) - } else { - Ok(true) - } + index - 1 + } + }; + Ok(holes[start].0.end < key_range.end) + } else { + Ok(true) } } From 08f450e42be44e45e9b18d4c1de39c42e9e38bc6 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 29 Jan 2023 21:59:19 +0200 Subject: [PATCH 16/44] Ignore errors in find_lsn_for_timestamp --- pageserver/src/tenant/timeline.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e8646225e70e..2a481aade513 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3080,23 +3080,23 @@ impl Timeline { if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await? { - LsnForTimestamp::Present(lsn) => lsn, - LsnForTimestamp::Future(lsn) => { + match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await { + Ok(LsnForTimestamp::Present(lsn)) => lsn, + Ok(LsnForTimestamp::Future(lsn)) => { // The timestamp is in the future. 
That sounds impossible, // but what it really means is that there hasn't been // any commits since the cutoff timestamp. debug!("future({})", lsn); cutoff_horizon } - LsnForTimestamp::Past(lsn) => { + Ok(LsnForTimestamp::Past(lsn)) => { debug!("past({})", lsn); // conservative, safe default is to remove nothing, when we // have no commit timestamp data available *self.get_latest_gc_cutoff_lsn() } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); + _ => { + //debug!("nodata({})", lsn); // conservative, safe default is to remove nothing, when we // have no commit timestamp data available *self.get_latest_gc_cutoff_lsn() From d7c09ca96a75121402d96a89b1f43badc5e73fce Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 30 Jan 2023 13:42:42 +0200 Subject: [PATCH 17/44] Revert "Ignore errors in find_lsn_for_timestamp" This reverts commit 8af69f0b63d851c38bd7532dde06e3c8e15e3b48. --- pageserver/src/tenant/timeline.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2a481aade513..e8646225e70e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3080,23 +3080,23 @@ impl Timeline { if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await { - Ok(LsnForTimestamp::Present(lsn)) => lsn, - Ok(LsnForTimestamp::Future(lsn)) => { + match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await? { + LsnForTimestamp::Present(lsn) => lsn, + LsnForTimestamp::Future(lsn) => { // The timestamp is in the future. That sounds impossible, // but what it really means is that there hasn't been // any commits since the cutoff timestamp. debug!("future({})", lsn); cutoff_horizon } - Ok(LsnForTimestamp::Past(lsn)) => { + LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); // conservative, safe default is to remove nothing, when we // have no commit timestamp data available *self.get_latest_gc_cutoff_lsn() } - _ => { - //debug!("nodata({})", lsn); + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); // conservative, safe default is to remove nothing, when we // have no commit timestamp data available *self.get_latest_gc_cutoff_lsn() From c05095f694c2af7fd8dd5b88787b65aa1e39a912 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 30 Jan 2023 13:46:52 +0200 Subject: [PATCH 18/44] Temp: do not use information about holes in GC --- pageserver/src/tenant/timeline.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e8646225e70e..76ab981d65b7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3300,7 +3300,9 @@ impl Timeline { // If GC horizon is at 2500, we can remove layers A and B, but // we cannot remove C, even though it's older than 2500, because // the delta layer 2000-3000 depends on it. - for occupied_range in &l.get_occupied_ranges()? { + //for occupied_range in &l.get_occupied_ranges()? + let occupied_range = &l.get_key_range(); + { if !layers .image_layer_exists(occupied_range, &(l.get_lsn_range().end..new_gc_cutoff))? 
{ From 0b32d14ee9518c137a30d0d1efad0541d8998126 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 30 Jan 2023 16:14:37 +0200 Subject: [PATCH 19/44] Fix LayerMap::search method --- pageserver/benches/bench_layer_map.rs | 2 +- pageserver/src/tenant/layer_map.rs | 22 ++++++++++++++-------- pageserver/src/tenant/timeline.rs | 4 +--- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5edfa84d8a2a..ba6ae7c205a8 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -206,7 +206,7 @@ fn bench_sequential(c: &mut Criterion) { let now = Instant::now(); let mut layer_map = LayerMap::default(); let mut updates = layer_map.batch_update(); - for i in 0..100_000 { + for i in 1..100_000 { let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); let layer = LayerDescriptor { diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 75637b67129d..193c5d42ba35 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -220,13 +220,13 @@ where /// NOTE: This only searches the 'historic' layers, *not* the /// 'open' and 'frozen' layers! /// - pub fn search(&self, key: Key, mut end_lsn: Lsn) -> Option> { - loop { - let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; - let latest_delta = version.delta_coverage.query(key.to_i128()); - let latest_image = version.image_coverage.query(key.to_i128()); + pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { + let mut version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let mut latest_delta = version.delta_coverage.query(key.to_i128()); + let latest_image = version.image_coverage.query(key.to_i128()); - return match (latest_delta, latest_image) { + loop { + return match (latest_delta, latest_image.clone()) { (None, None) => None, (None, Some(image)) => { let lsn_floor = image.get_lsn_range().start; @@ -239,7 +239,8 @@ where let lsn_floor = delta.get_lsn_range().start; if let Ok(contains) = delta.overlaps(&(key..key.next())) { if !contains { - end_lsn = lsn_floor; + version = self.historic.get().unwrap().get_version(lsn_floor.0 - 1)?; + latest_delta = version.delta_coverage.query(key.to_i128()); continue; } } @@ -260,7 +261,12 @@ where } else { if let Ok(contains) = delta.overlaps(&(key..key.next())) { if !contains { - end_lsn = delta.get_lsn_range().start; + version = self + .historic + .get() + .unwrap() + .get_version(delta.get_lsn_range().start.0 - 1)?; + latest_delta = version.delta_coverage.query(key.to_i128()); continue; } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 76ab981d65b7..e8646225e70e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3300,9 +3300,7 @@ impl Timeline { // If GC horizon is at 2500, we can remove layers A and B, but // we cannot remove C, even though it's older than 2500, because // the delta layer 2000-3000 depends on it. - //for occupied_range in &l.get_occupied_ranges()? - let occupied_range = &l.get_key_range(); - { + for occupied_range in &l.get_occupied_ranges()? { if !layers .image_layer_exists(occupied_range, &(l.get_lsn_range().end..new_gc_cutoff))? 
{ From 082404432ed9bae1cd2b0d9d5f05172040a12e1f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 30 Jan 2023 21:50:38 +0200 Subject: [PATCH 20/44] Trnsacte index_part.json file in test_tenant_upgrades_index_json_from_v0 --- test_runner/regress/test_ondemand_download.py | 2 +- test_runner/regress/test_tenants_with_remote_storage.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index fe4d080f2bcb..df2f797ab930 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -1,10 +1,10 @@ # It's possible to run any regular test with the local fs remote storage via # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... +import time from pathlib import Path import pytest -import time from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6da6a4d4463c..769bc10280b8 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -280,6 +280,7 @@ def test_tenant_upgrades_index_json_from_v0( timeline_file.seek(0) json.dump(v0_index_part, timeline_file) + timeline_file.truncate(timeline_file.tell()) env.pageserver.start() pageserver_http = env.pageserver.http_client() From 3dc4ab3c68e9f6d88ddcbb9940378fbe22546da9 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 31 Jan 2023 10:13:01 +0200 Subject: [PATCH 21/44] Add holes information while upgrading layer metadata --- .../src/tenant/remote_timeline_client.rs | 61 +++++++++++-------- .../src/tenant/storage_layer/remote_layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 18 ++++++ 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index abb7f3b2a951..205989666039 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -419,33 +419,6 @@ impl RemoteTimelineClient { .await? }; - // Update the metadata for given layer file. The remote index file - // might be missing some information for the file; this allows us - // to fill in the missing details. - if layer_metadata.file_size().is_none() { - let new_metadata = LayerFileMetadata::new(downloaded_size, None); - let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; - if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { - if upgraded.merge(&new_metadata) { - upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; - } - // If we don't do an index file upload inbetween here and restart, - // the value will go back down after pageserver restart, since we will - // have lost this data point. - // But, we upload index part fairly frequently, and restart pageserver rarely. - // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner. - self.metrics - .remote_physical_size_gauge() - .add(downloaded_size); - } else { - // The file should exist, since we just downloaded it. - warn!( - "downloaded file {:?} not found in local copy of the index file", - layer_file_name - ); - } - } Ok(downloaded_size) } @@ -453,6 +426,40 @@ impl RemoteTimelineClient { // Upload operations. 
// + /// + /// Upgrade layer metadata + /// + pub async fn upgrade_layer_metadata( + &self, + layer_file_name: &LayerFileName, + old_metadata: &LayerFileMetadata, + new_metadata: &LayerFileMetadata, + ) { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut().unwrap(); + if let Some(upgraded) = upload_queue.latest_files.get_mut(&layer_file_name) { + if upgraded.merge(&new_metadata) { + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; + } + // If we don't do an index file upload inbetween here and restart, + // the value will go back down after pageserver restart, since we will + // have lost this data point. + // But, we upload index part fairly frequently, and restart pageserver rarely. + // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner. + if old_metadata.file_size().is_none() { + self.metrics + .remote_physical_size_gauge() + .add(new_metadata.file_size().unwrap_or(0)); + } + } else { + // The file should exist, since we just downloaded it. + warn!( + "downloaded file {:?} not found in local copy of the index file", + &layer_file_name + ); + } + } + /// /// Launch an index-file upload operation in the background, with /// updated metadata. diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 24e59fb1cca7..3c4527d7292c 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -44,7 +44,7 @@ pub struct RemoteLayer { pub layer_metadata: LayerFileMetadata, - is_delta: bool, + pub is_delta: bool, is_incremental: bool, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e8646225e70e..0ac278ae861a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3504,6 +3504,24 @@ impl Timeline { // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size); + + // Update the metadata for given layer file. The remote index file + // might be missing some information for the file; this allows us + // to fill in the missing details. 
+ if remote_layer.layer_metadata.file_size().is_none() + || (remote_layer.is_delta && remote_layer.layer_metadata.holes().is_none()) + { + let new_metadata = + LayerFileMetadata::new(*size, new_layer.get_holes().unwrap()); + remote_client + .upgrade_layer_metadata( + &remote_layer.file_name, + &remote_layer.layer_metadata, + &new_metadata, + ) + .await; + } + let mut layers = self_clone.layers.write().unwrap(); let mut updates = layers.batch_update(); { From eef2ab64441c46821d73aa55c8bcdf63c44a74b3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 31 Jan 2023 10:38:58 +0200 Subject: [PATCH 22/44] Make clippy happy --- pageserver/src/tenant/remote_timeline_client.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 205989666039..cf5ed9226427 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -437,8 +437,8 @@ impl RemoteTimelineClient { ) { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); - if let Some(upgraded) = upload_queue.latest_files.get_mut(&layer_file_name) { - if upgraded.merge(&new_metadata) { + if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { + if upgraded.merge(new_metadata) { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; } // If we don't do an index file upload inbetween here and restart, @@ -455,7 +455,7 @@ impl RemoteTimelineClient { // The file should exist, since we just downloaded it. warn!( "downloaded file {:?} not found in local copy of the index file", - &layer_file_name + layer_file_name ); } } From 6c54d0d4830617a9b8e5cf4c831276dd9df5daff Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 31 Jan 2023 18:45:28 +0200 Subject: [PATCH 23/44] Provide custom serializer for Hole --- pageserver/src/tenant/layer_map.rs | 6 +-- pageserver/src/tenant/storage_layer.rs | 70 ++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 193c5d42ba35..798e585b40e4 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -49,7 +49,7 @@ mod layer_coverage; use crate::context::RequestContext; use crate::keyspace::KeyPartitioning; use crate::metrics::NUM_ONDISK_LAYERS; -use crate::repository::Key; +use crate::repository::{singleton_range, Key}; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; use anyhow::Result; @@ -237,7 +237,7 @@ where } (Some(delta), None) => { let lsn_floor = delta.get_lsn_range().start; - if let Ok(contains) = delta.overlaps(&(key..key.next())) { + if let Ok(contains) = delta.overlaps(&singleton_range(key)) { if !contains { version = self.historic.get().unwrap().get_version(lsn_floor.0 - 1)?; latest_delta = version.delta_coverage.query(key.to_i128()); @@ -259,7 +259,7 @@ where lsn_floor: img_lsn, }) } else { - if let Ok(contains) = delta.overlaps(&(key..key.next())) { + if let Ok(contains) = delta.overlaps(&singleton_range(key)) { if !contains { version = self .historic diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f63163832875..2cdaa79e1060 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -19,7 +19,10 @@ use pageserver_api::models::LayerAccessKind; use 
pageserver_api::models::{ HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, }; -use serde::{Deserialize, Serialize}; +use pageserver_api::models::HistoricLayerInfo; +use serde::ser::SerializeMap; +use serde::{de::MapAccess, de::Visitor, Deserialize, Deserializer, Serialize, Serializer}; +use std::fmt; use std::ops::Range; use std::path::PathBuf; use std::sync::{Arc, Mutex}; @@ -295,7 +298,11 @@ pub trait Layer: std::fmt::Debug + Send + Sync { Ok(vec![self.get_key_range()]) } - /// Get list of holes + /// Get list of holes in key range: returns up to MAX_CACHED_HOLES largest holes, ignoring any that are smaller + /// than MIN_HOLE_LENGTH. + /// Only delta layers can contain holes. Image is consdered as always dense, despite to the fact that it doesn't + /// contain all possible key values in the specified range: there are may be no keys in the storage belonging + /// to the image layer range but not present in the image layer. fn get_holes(&self) -> Result>> { Ok(None) } @@ -388,9 +395,66 @@ pub struct LayerDescriptor { } /// Wrapper for key range to provide reverse ordering by range length (for BinaryHeap) -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct Hole(Range); +impl Serialize for Hole { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut hole = serializer.serialize_map(Some(2))?; + hole.serialize_entry("start", &self.0.start.to_string())?; + hole.serialize_entry("end", &self.0.end.to_string())?; + hole.end() + } +} + +struct HoleVisitor; + +impl<'de> Visitor<'de> for HoleVisitor { + type Value = Hole; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!(formatter, "a map with keys 'start' and 'end'") + } + + fn visit_map(self, mut map: M) -> Result + where + M: MapAccess<'de>, + { + let mut start = None; + let mut end = None; + + while let Some(k) = map.next_key::<&str>()? 
{ + if k == "start" { + start = Some(map.next_value()?); + } else if k == "end" { + end = Some(map.next_value()?); + } else { + return Err(serde::de::Error::custom(&format!("Invalid key: {}", k))); + } + } + + if start.is_none() || end.is_none() { + return Err(serde::de::Error::custom("Missing start or end")); + } + + Ok(Hole( + Key::from_hex(start.unwrap()).unwrap()..Key::from_hex(end.unwrap()).unwrap(), + )) + } +} + +impl<'de> Deserialize<'de> for Hole { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + deserializer.deserialize_map(HoleVisitor) + } +} + impl Layer for LayerDescriptor { fn get_key_range(&self) -> Range { self.key.clone() From d63a7450affdbfe9131be6c5572e6f68cdad49ad Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 1 Feb 2023 16:54:18 +0200 Subject: [PATCH 24/44] Propagate RequestContext --- pageserver/benches/bench_layer_map.rs | 20 +++++--- pageserver/src/task_mgr.rs | 2 + pageserver/src/tenant.rs | 11 ++-- pageserver/src/tenant/layer_map.rs | 50 ++++++++++++------- pageserver/src/tenant/storage_layer.rs | 6 +-- .../src/tenant/storage_layer/delta_layer.rs | 12 ++--- .../src/tenant/storage_layer/remote_layer.rs | 8 +-- pageserver/src/tenant/timeline.rs | 44 ++++++++++------ 8 files changed, 98 insertions(+), 55 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index ba6ae7c205a8..0ba5379d1c31 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,5 +1,7 @@ +use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; +use pageserver::task_mgr::TaskKind; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; @@ -106,15 +108,15 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> // a project where we have run pgbench many timmes. The pgbench database was initialized // between each test run. fn bench_from_captest_env(c: &mut Criterion) { + let ctx = RequestContext::new(TaskKind::Benchmark, DownloadBehavior::Error); // TODO consider compressing this file let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Test with uniform query pattern c.bench_function("captest_uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1); + layer_map.search(q.0, q.1, &ctx); } }); }); @@ -126,6 +128,7 @@ fn bench_from_captest_env(c: &mut Criterion) { Key::from_hex("000000067F00008000000000000000000001").unwrap(), // This LSN is higher than any of the LSNs in the tree Lsn::from_str("D0/80208AE1").unwrap(), + &ctx, ); result.unwrap(); }); @@ -135,6 +138,7 @@ fn bench_from_captest_env(c: &mut Criterion) { // Benchmark using metadata extracted from a real project that was taknig // too long processing layer map queries. 
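// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series. Patch 23 above gives Hole
// a hand-written Serialize/Deserialize pair that encodes each hole as a
// two-entry map of string-encoded keys (parsed back with Key::from_hex). A
// self-contained stand-in showing the resulting wire format, with String in
// place of the pageserver Key type; serde and serde_json are assumed as
// dependencies:

use serde::ser::SerializeMap;
use serde::{Serialize, Serializer};
use std::ops::Range;

struct Hole(Range<String>);

impl Serialize for Hole {
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        let mut map = serializer.serialize_map(Some(2))?;
        map.serialize_entry("start", &self.0.start)?;
        map.serialize_entry("end", &self.0.end)?;
        map.end()
    }
}

fn main() {
    // The strings stand in for hex-encoded keys.
    let hole = Hole("000000067F0000300000".to_string().."000000067F0000400000".to_string());
    // Prints: {"start":"000000067F0000300000","end":"000000067F0000400000"}
    println!("{}", serde_json::to_string(&hole).unwrap());
}
// ---------------------------------------------------------------------------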
fn bench_from_real_project(c: &mut Criterion) { + let ctx = RequestContext::new(TaskKind::Benchmark, DownloadBehavior::Error); // Init layer map let now = Instant::now(); let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); @@ -157,12 +161,13 @@ fn bench_from_real_project(c: &mut Criterion) { println!("running correctness check"); let now = Instant::now(); - let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning); + let result_bruteforce = + layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning, &ctx); assert!(result_bruteforce.len() == partitioning.parts.len()); println!("Finished bruteforce in {:?}", now.elapsed()); let now = Instant::now(); - let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None); + let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None, &ctx); assert!(result_fast.len() == partitioning.parts.len()); println!("Finished fast in {:?}", now.elapsed()); @@ -183,13 +188,13 @@ fn bench_from_real_project(c: &mut Criterion) { group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1); + layer_map.search(q.0, q.1, &ctx); } }); }); group.bench_function("get_difficulty_map", |b| { b.iter(|| { - layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3)); + layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3), &ctx); }); }); group.finish(); @@ -197,6 +202,7 @@ fn bench_from_real_project(c: &mut Criterion) { // Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. fn bench_sequential(c: &mut Criterion) { + let ctx = RequestContext::new(TaskKind::Benchmark, DownloadBehavior::Error); // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. // // TODO This code is pretty slow and runs even if we're only running other @@ -232,7 +238,7 @@ fn bench_sequential(c: &mut Criterion) { group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1); + layer_map.search(q.0, q.1, &ctx); } }); }); diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index c4f213e75572..15225dddb5fa 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -264,6 +264,8 @@ pub enum TaskKind { DebugTool, + Benchmark, + #[cfg(test)] UnitTest, } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bc943372f8fe..7a1aac44aa81 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -459,7 +459,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -528,6 +528,7 @@ impl Tenant { .reconcile_with_remote( up_to_date_metadata, remote_startup_data.as_ref().map(|r| &r.index_part), + ctx, ) .await .context("failed to reconcile with remote")? @@ -1954,7 +1955,7 @@ impl Tenant { // made. 
break; } - let result = timeline.gc().await?; + let result = timeline.gc(ctx).await?; totals += result; } @@ -3426,7 +3427,7 @@ mod tests { .await?; tline.freeze_and_flush().await?; tline.compact(&ctx).await?; - tline.gc().await?; + tline.gc(&ctx).await?; } Ok(()) @@ -3498,7 +3499,7 @@ mod tests { .await?; tline.freeze_and_flush().await?; tline.compact(&ctx).await?; - tline.gc().await?; + tline.gc(&ctx).await?; } Ok(()) @@ -3582,7 +3583,7 @@ mod tests { .await?; tline.freeze_and_flush().await?; tline.compact(&ctx).await?; - tline.gc().await?; + tline.gc(&ctx).await?; } Ok(()) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 798e585b40e4..57729db7826b 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -220,7 +220,7 @@ where /// NOTE: This only searches the 'historic' layers, *not* the /// 'open' and 'frozen' layers! /// - pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { + pub fn search(&self, key: Key, end_lsn: Lsn, ctx: &RequestContext) -> Option> { let mut version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; let mut latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); @@ -237,7 +237,7 @@ where } (Some(delta), None) => { let lsn_floor = delta.get_lsn_range().start; - if let Ok(contains) = delta.overlaps(&singleton_range(key)) { + if let Ok(contains) = delta.overlaps(&singleton_range(key), ctx) { if !contains { version = self.historic.get().unwrap().get_version(lsn_floor.0 - 1)?; latest_delta = version.delta_coverage.query(key.to_i128()); @@ -259,7 +259,7 @@ where lsn_floor: img_lsn, }) } else { - if let Ok(contains) = delta.overlaps(&singleton_range(key)) { + if let Ok(contains) = delta.overlaps(&singleton_range(key), ctx) { if !contains { version = self .historic @@ -495,8 +495,12 @@ where /// TODO The optimal number should probably be slightly higher than 1, but to /// implement that we need to plumb a lot more context into this function /// than just the current partition_range. - pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> Result { - if !layer.overlaps(partition_range)? { + pub fn is_reimage_worthy( + layer: &L, + partition_range: &Range, + ctx: &RequestContext, + ) -> Result { + if !layer.overlaps(partition_range, ctx)? { return Ok(false); } // Case 1 @@ -525,6 +529,7 @@ where key: &Range, lsn: &Range, limit: Option, + ctx: &RequestContext, ) -> Result { // We get the delta coverage of the region, and for each part of the coverage // we recurse right underneath the delta. The recursion depth is limited by @@ -556,10 +561,10 @@ where let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key)? as usize; + let base_count = Self::is_reimage_worthy(&val, key, ctx)? as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = - self.count_deltas(&kr, &lr, new_limit)?; + self.count_deltas(&kr, &lr, new_limit, ctx)?; max_stacked_deltas = std::cmp::max( max_stacked_deltas, base_count + max_stacked_deltas_underneath, @@ -578,9 +583,10 @@ where let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key)? as usize; + let base_count = Self::is_reimage_worthy(&val, key, ctx)? 
as usize; let new_limit = limit.map(|l| l - base_count); - let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; + let max_stacked_deltas_underneath = + self.count_deltas(&kr, &lr, new_limit, ctx)?; max_stacked_deltas = std::cmp::max( max_stacked_deltas, base_count + max_stacked_deltas_underneath, @@ -597,13 +603,19 @@ where /// The `partition_range` argument is used as context for the reimage-worthiness decision. /// /// Used as a helper for correctness checks only. Performance not critical. - pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range) -> usize { - match self.search(key, lsn) { + pub fn get_difficulty( + &self, + lsn: Lsn, + key: Key, + partition_range: &Range, + ctx: &RequestContext, + ) -> usize { + match self.search(key, lsn, ctx) { Some(search_result) => { if search_result.layer.is_incremental() { - (Self::is_reimage_worthy(&search_result.layer, partition_range).unwrap() + (Self::is_reimage_worthy(&search_result.layer, partition_range, ctx).unwrap() as usize) - + self.get_difficulty(search_result.lsn_floor, key, partition_range) + + self.get_difficulty(search_result.lsn_floor, key, partition_range, ctx) } else { 0 } @@ -618,6 +630,7 @@ where &self, lsn: Lsn, partitioning: &KeyPartitioning, + ctx: &RequestContext, ) -> Vec { // Looking at the difficulty as a function of key, it could only increase // when a delta layer starts or an image layer ends. Therefore it's sufficient @@ -652,8 +665,10 @@ where // TODO assert it for range in &part.ranges { if !range.is_empty() { - difficulty = - std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range)); + difficulty = std::cmp::max( + difficulty, + self.get_difficulty(lsn, range.start, range, ctx), + ); } while let Some(key) = keys_iter.peek() { if key >= &range.end { @@ -664,7 +679,7 @@ where continue; } difficulty = - std::cmp::max(difficulty, self.get_difficulty(lsn, key, range)); + std::cmp::max(difficulty, self.get_difficulty(lsn, key, range, ctx)); } } difficulty @@ -689,6 +704,7 @@ where lsn: Lsn, partitioning: &KeyPartitioning, limit: Option, + ctx: &RequestContext, ) -> Vec { // TODO This is a naive implementation. Perf improvements to do: // 1. Instead of calling self.image_coverage and self.count_deltas, @@ -717,7 +733,7 @@ where if img_lsn < lsn { let num_deltas = self - .count_deltas(&img_range, &(img_lsn..lsn), limit) + .count_deltas(&img_range, &(img_lsn..lsn), limit, ctx) .expect("why would this err lol?"); difficulty = std::cmp::max(difficulty, num_deltas); } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 2cdaa79e1060..50ea66c63317 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -289,12 +289,12 @@ pub trait Layer: std::fmt::Debug + Send + Sync { fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>; /// Checks if layer contains any entries belonging to the specified key range - fn overlaps(&self, key_range: &Range) -> Result { + fn overlaps(&self, key_range: &Range, _ctx: &RequestContext) -> Result { Ok(range_overlaps(&self.get_key_range(), key_range)) } /// Skip holes in this layer key range - fn get_occupied_ranges(&self) -> Result>> { + fn get_occupied_ranges(&self, _ctx: &RequestContext) -> Result>> { Ok(vec![self.get_key_range()]) } @@ -303,7 +303,7 @@ pub trait Layer: std::fmt::Debug + Send + Sync { /// Only delta layers can contain holes. 
Image is consdered as always dense, despite to the fact that it doesn't /// contain all possible key values in the specified range: there are may be no keys in the storage belonging /// to the image layer range but not present in the image layer. - fn get_holes(&self) -> Result>> { + fn get_holes(&self, _ctx: &RequestContext) -> Result>> { Ok(None) } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 91b447dcf32c..86fd3226a594 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -337,8 +337,8 @@ impl Layer for DeltaLayer { Ok(()) } - fn get_occupied_ranges(&self) -> Result>> { - let inner = self.load()?; + fn get_occupied_ranges(&self, ctx: &RequestContext) -> Result>> { + let inner = self.load(ctx)?; if let Some(holes) = &inner.holes { let mut occ = Vec::with_capacity(holes.len() + 1); let key_range = self.get_key_range(); @@ -354,16 +354,16 @@ impl Layer for DeltaLayer { } } - fn get_holes(&self) -> Result>> { - let inner = self.load()?; + fn get_holes(&self, ctx: &RequestContext) -> Result>> { + let inner = self.load(ctx)?; Ok(inner.holes.clone()) } - fn overlaps(&self, key_range: &Range) -> anyhow::Result { + fn overlaps(&self, key_range: &Range, ctx: &RequestContext) -> anyhow::Result { if !range_overlaps(&self.key_range, key_range) { Ok(false) } else { - let inner = self.load()?; + let inner = self.load(ctx)?; if let Some(holes) = &inner.holes { let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) { diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 3c4527d7292c..ef911aa22269 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -89,7 +89,7 @@ impl Layer for RemoteLayer { self.is_incremental } - fn get_occupied_ranges(&self) -> Result>> { + fn get_occupied_ranges(&self, _ctx: &RequestContext) -> Result>> { if let Some(holes) = &self.layer_metadata.holes() { let mut occ = Vec::with_capacity(holes.len() + 1); let key_range = self.get_key_range(); @@ -105,11 +105,11 @@ impl Layer for RemoteLayer { } } - fn get_holes(&self) -> Result>> { + fn get_holes(&self, _ctx: &RequestContext) -> Result>> { Ok(self.layer_metadata.holes()) } - fn overlaps(&self, key_range: &Range) -> anyhow::Result { + fn overlaps(&self, key_range: &Range, _ctx: &RequestContext) -> anyhow::Result { if !range_overlaps(&self.key_range, key_range) { Ok(false) } else if let Some(holes) = &self.layer_metadata.holes() { @@ -278,6 +278,7 @@ impl RemoteLayer { &self, conf: &'static PageServerConf, file_size: u64, + ctx: &RequestContext, ) -> Arc { if self.is_delta { let fname = DeltaFileName { @@ -293,6 +294,7 @@ impl RemoteLayer { self.access_stats .clone_for_residence_change(LayerResidenceStatus::Resident), self.get_holes().unwrap(), + self.get_holes(ctx).unwrap(), )) } else { let fname = ImageFileName { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0ac278ae861a..888c847a36fb 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1503,11 +1503,12 @@ impl Timeline { /// # TODO /// May be a bit cleaner to do things based on populated remote client, /// and then do things based on its upload_queue.latest_files. 
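// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series. Patch 24 above keeps the
// hole-aware overlaps() check in DeltaLayer, which binary-searches the cached,
// sorted holes. The check in isolation, simplified to u64 keys; the function
// and parameter names are invented, and the real code works on Key and Hole
// values loaded from the layer file:

use std::ops::Range;

/// `holes` must be sorted by start, disjoint, and lie strictly inside `layer_range`.
fn overlaps(layer_range: &Range<u64>, holes: &[Range<u64>], query: &Range<u64>) -> bool {
    // No intersection with the layer's outer key range at all.
    if query.end <= layer_range.start || query.start >= layer_range.end {
        return false;
    }
    // Find the last hole starting at or before query.start.
    let idx = match holes.binary_search_by_key(&query.start, |hole| hole.start) {
        Ok(i) => i,
        Err(0) => return true, // no hole starts before the query, so it hits occupied keys
        Err(i) => i - 1,
    };
    // The layer is overlapped unless the query falls entirely inside that hole.
    !(holes[idx].start <= query.start && query.end <= holes[idx].end)
}
// ---------------------------------------------------------------------------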
- #[instrument(skip(self, index_part, up_to_date_metadata))] + #[instrument(skip(self, index_part, up_to_date_metadata, ctx))] pub async fn reconcile_with_remote( &self, up_to_date_metadata: &TimelineMetadata, index_part: Option<&IndexPart>, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("starting"); let remote_client = self @@ -1555,7 +1556,7 @@ impl Timeline { info!("scheduling {layer_path:?} for upload"); remote_client.schedule_layer_file_upload( layer_name, - &LayerFileMetadata::new(layer_size, layer.get_holes()?), + &LayerFileMetadata::new(layer_size, layer.get_holes(ctx)?), )?; } remote_client.schedule_index_upload_for_file_changes()?; @@ -2039,7 +2040,9 @@ impl Timeline { } } - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { + if let Some(SearchResult { lsn_floor, layer }) = + layers.search(key, cont_lsn, ctx) + { // If it's a remote layer, download it and retry. if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) @@ -2362,7 +2365,7 @@ impl Timeline { .await? } else { // normal case, write out a L0 delta layer file. - let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?; + let (delta_path, metadata) = self.create_delta_layer(&frozen_layer, ctx)?; HashMap::from([(delta_path, metadata)]) }; @@ -2468,6 +2471,7 @@ impl Timeline { fn create_delta_layer( &self, frozen_layer: &InMemoryLayer, + ctx: &RequestContext, ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> { // Write it out let new_delta = frozen_layer.write_to_disk()?; @@ -2487,7 +2491,7 @@ impl Timeline { self.conf.timeline_path(&self.timeline_id, &self.tenant_id), ])?; - let holes = new_delta.get_holes()?; + let holes = new_delta.get_holes(ctx)?; // Add it to the layer map self.layers @@ -2535,7 +2539,12 @@ impl Timeline { } // Is it time to create a new image layer for the given partition? - fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { + fn time_for_new_image_layer( + &self, + partition: &KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { let layers = self.layers.read().unwrap(); for part_range in &partition.ranges { @@ -2561,7 +2570,7 @@ impl Timeline { if img_lsn < lsn { let threshold = self.get_image_creation_threshold(); let num_deltas = - layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; + layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold), ctx)?; debug!( "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", @@ -2587,7 +2596,7 @@ impl Timeline { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); for partition in partitioning.parts.iter() { - if force || self.time_for_new_image_layer(partition, lsn)? { + if force || self.time_for_new_image_layer(partition, lsn, ctx)? 
{ let img_range = partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; let mut image_layer_writer = ImageLayerWriter::new( @@ -2993,7 +3002,7 @@ impl Timeline { if let Some(remote_client) = &self.remote_client { remote_client.schedule_layer_file_upload( &l.filename(), - &LayerFileMetadata::new(metadata.len(), l.get_holes()?), + &LayerFileMetadata::new(metadata.len(), l.get_holes(ctx)?), )?; } @@ -3004,7 +3013,7 @@ impl Timeline { new_layer_paths.insert( new_delta_path, - LayerFileMetadata::new(metadata.len(), l.get_holes()?), + LayerFileMetadata::new(metadata.len(), l.get_holes(ctx)?), ); let x: Arc = Arc::new(l); updates.insert_historic(x); @@ -3130,7 +3139,7 @@ impl Timeline { /// within a layer file. We can only remove the whole file if it's fully /// obsolete. /// - pub(super) async fn gc(&self) -> anyhow::Result { + pub(super) async fn gc(&self, ctx: &RequestContext) -> anyhow::Result { let timer = self.metrics.garbage_collect_histo.start_timer(); fail_point!("before-timeline-gc"); @@ -3160,6 +3169,7 @@ impl Timeline { pitr_cutoff, retain_lsns, new_gc_cutoff, + ctx, ) .instrument( info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff), @@ -3179,6 +3189,7 @@ impl Timeline { pitr_cutoff: Lsn, retain_lsns: Vec, new_gc_cutoff: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { let now = SystemTime::now(); let mut result: GcResult = GcResult::default(); @@ -3300,7 +3311,7 @@ impl Timeline { // If GC horizon is at 2500, we can remove layers A and B, but // we cannot remove C, even though it's older than 2500, because // the delta layer 2000-3000 depends on it. - for occupied_range in &l.get_occupied_ranges()? { + for occupied_range in &l.get_occupied_ranges(ctx)? { if !layers .image_layer_exists(occupied_range, &(l.get_lsn_range().end..new_gc_cutoff))? { @@ -3489,6 +3500,10 @@ impl Timeline { false, async move { let remote_client = self_clone.remote_client.as_ref().unwrap(); + let ctx = RequestContext::new( + TaskKind::DownloadAllRemoteLayers, + DownloadBehavior::Download, + ); // Does retries + exponential back-off internally. // When this fails, don't layer further retry attempts here. @@ -3503,7 +3518,8 @@ impl Timeline { // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. - let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size); + let new_layer = + remote_layer.create_downloaded_layer(self_clone.conf, *size, &ctx); // Update the metadata for given layer file. 
The remote index file // might be missing some information for the file; this allows us @@ -3512,7 +3528,7 @@ impl Timeline { || (remote_layer.is_delta && remote_layer.layer_metadata.holes().is_none()) { let new_metadata = - LayerFileMetadata::new(*size, new_layer.get_holes().unwrap()); + LayerFileMetadata::new(*size, new_layer.get_holes(&ctx).unwrap()); remote_client .upgrade_layer_metadata( &remote_layer.file_name, From 7c3c2712e52ac6cadcdeda0d025bd79b59975320 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 10:08:24 +0200 Subject: [PATCH 25/44] Store historic layers in separate set --- pageserver/src/tenant.rs | 15 +- pageserver/src/tenant/layer_map.rs | 269 ++++++++++-------- .../layer_map/historic_layer_coverage.rs | 11 - pageserver/src/tenant/timeline.rs | 64 ++--- 4 files changed, 175 insertions(+), 184 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7a1aac44aa81..936126c51f6f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -176,9 +176,9 @@ impl UninitializedTimeline<'_> { /// /// The new timeline is initialized in Active state, and its background jobs are /// started - pub fn initialize(self, _ctx: &RequestContext) -> anyhow::Result> { + pub fn initialize(self, ctx: &RequestContext) -> anyhow::Result> { let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - self.initialize_with_lock(&mut timelines, true, true) + self.initialize_with_lock(&mut timelines, true, true, ctx) } /// Like `initialize`, but the caller is already holding lock on Tenant::timelines. @@ -191,6 +191,7 @@ impl UninitializedTimeline<'_> { timelines: &mut HashMap>, load_layer_map: bool, activate: bool, + ctx: &RequestContext, ) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_id = self.owning_tenant.tenant_id; @@ -211,7 +212,7 @@ impl UninitializedTimeline<'_> { Entry::Vacant(v) => { if load_layer_map { new_timeline - .load_layer_map(new_disk_consistent_lsn) + .load_layer_map(new_disk_consistent_lsn, ctx) .with_context(|| { format!( "Failed to load layermap for timeline {tenant_id}/{timeline_id}" @@ -494,7 +495,7 @@ impl Tenant { // Do not start walreceiver here. We do need loaded layer map for reconcile_with_remote // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver // will ingest data which may require looking at the layers which are not yet available locally - match timeline.initialize_with_lock(&mut timelines_accessor, true, false) { + match timeline.initialize_with_lock(&mut timelines_accessor, true, false, ctx) { Ok(new_timeline) => new_timeline, Err(e) => { error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}"); @@ -2079,7 +2080,7 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result> { let src_id = src_timeline.timeline_id; @@ -2172,7 +2173,7 @@ impl Tenant { false, Some(Arc::clone(src_timeline)), )? - .initialize_with_lock(&mut timelines, true, true)?; + .initialize_with_lock(&mut timelines, true, true, ctx)?; drop(timelines); info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); @@ -2273,7 +2274,7 @@ impl Tenant { let timeline = { let mut timelines = self.timelines.lock().unwrap(); - raw_timeline.initialize_with_lock(&mut timelines, false, true)? + raw_timeline.initialize_with_lock(&mut timelines, false, true, ctx)? 
}; info!( diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 57729db7826b..3ab93f15b225 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -49,11 +49,12 @@ mod layer_coverage; use crate::context::RequestContext; use crate::keyspace::KeyPartitioning; use crate::metrics::NUM_ONDISK_LAYERS; -use crate::repository::{singleton_range, Key}; +use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; use anyhow::Result; -use std::collections::VecDeque; +use std::collections::{HashSet, VecDeque}; +use std::hash::{Hash, Hasher}; use std::ops::Range; use std::sync::Arc; use utils::lsn::Lsn; @@ -63,6 +64,22 @@ pub use historic_layer_coverage::Replacement; use super::storage_layer::range_eq; +struct LayerRef(Arc); + +impl PartialEq for LayerRef { + fn eq(&self, other: &LayerRef) -> bool { + Arc::ptr_eq(&self.0, &other.0) + } +} + +impl Eq for LayerRef {} + +impl Hash for LayerRef { + fn hash(&self, state: &mut H) { + Arc::into_raw(Arc::clone(&self.0)).hash(state) + } +} + /// /// LayerMap tracks what layers exist on a timeline. /// @@ -89,9 +106,11 @@ pub struct LayerMap { /// Index of the historic layers optimized for search historic: BufferedHistoricLayerCoverage>, - /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. - /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. - l0_delta_layers: Vec>, + /// L0 layers are kept in separate set because we need fast way to iterate through L0 layers in compaction + l0_delta_layers: HashSet>, + + /// `historic` tree may contain maultuple reference to one layer, so lets maintain separate set instead of eliminating duplicates + historic_layers: HashSet>, } impl Default for LayerMap { @@ -100,8 +119,9 @@ impl Default for LayerMap { open_layer: None, next_open_layer_at: None, frozen_layers: VecDeque::default(), - l0_delta_layers: Vec::default(), historic: BufferedHistoricLayerCoverage::default(), + l0_delta_layers: HashSet::default(), + historic_layers: HashSet::default(), } } } @@ -126,8 +146,8 @@ where /// /// Insert an on-disk layer. /// - pub fn insert_historic(&mut self, layer: Arc) { - self.layer_map.insert_historic_noflush(layer) + pub fn insert_historic(&mut self, layer: Arc, ctx: &RequestContext) -> Result<()> { + self.layer_map.insert_historic_noflush(layer, ctx) } /// @@ -135,8 +155,8 @@ where /// /// This should be called when the corresponding file on disk has been deleted. /// - pub fn remove_historic(&mut self, layer: Arc) { - self.layer_map.remove_historic_noflush(layer) + pub fn remove_historic(&mut self, layer: Arc, ctx: &RequestContext) { + self.layer_map.remove_historic_noflush(layer, ctx) } /// Replaces existing layer iff it is the `expected`. @@ -153,8 +173,9 @@ where &mut self, expected: &Arc, new: Arc, - ) -> anyhow::Result>> { - self.layer_map.replace_historic_noflush(expected, new) + ctx: &RequestContext, + ) -> anyhow::Result { + self.layer_map.replace_historic_noflush(expected, new, ctx) } // We will flush on drop anyway, but this method makes it @@ -220,67 +241,45 @@ where /// NOTE: This only searches the 'historic' layers, *not* the /// 'open' and 'frozen' layers! 
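// ---------------------------------------------------------------------------
// Editor's note, not part of the patch series. The Hash impl for LayerRef in
// patch 25 above calls Arc::into_raw(Arc::clone(&self.0)) with no matching
// Arc::from_raw, so every hash() call leaks one strong reference and the
// referenced layer can then never be dropped. A leak-free pointer-identity
// wrapper, sketched with a plain trait object rather than the Layer trait:

use std::collections::HashSet;
use std::hash::{Hash, Hasher};
use std::sync::Arc;

trait Thing: Send + Sync {}

struct ByAddress(Arc<dyn Thing>);

impl PartialEq for ByAddress {
    fn eq(&self, other: &Self) -> bool {
        Arc::ptr_eq(&self.0, &other.0)
    }
}
impl Eq for ByAddress {}

impl Hash for ByAddress {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // Hash only the data address: consistent with the ptr_eq-based equality
        // above, and it never touches the reference count.
        (Arc::as_ptr(&self.0) as *const () as usize).hash(state);
    }
}

fn _demo(layer: Arc<dyn Thing>) {
    let mut set: HashSet<ByAddress> = HashSet::new();
    set.insert(ByAddress(layer));
}
// ---------------------------------------------------------------------------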
/// - pub fn search(&self, key: Key, end_lsn: Lsn, ctx: &RequestContext) -> Option> { - let mut version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; - let mut latest_delta = version.delta_coverage.query(key.to_i128()); + pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { + let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); - loop { - return match (latest_delta, latest_image.clone()) { - (None, None) => None, - (None, Some(image)) => { - let lsn_floor = image.get_lsn_range().start; + match (latest_delta, latest_image.clone()) { + (None, None) => None, + (None, Some(image)) => { + let lsn_floor = image.get_lsn_range().start; + Some(SearchResult { + layer: image, + lsn_floor, + }) + } + (Some(delta), None) => { + let lsn_floor = delta.get_lsn_range().start; + Some(SearchResult { + layer: delta, + lsn_floor, + }) + } + (Some(delta), Some(image)) => { + let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { Some(SearchResult { layer: image, - lsn_floor, + lsn_floor: img_lsn, }) - } - (Some(delta), None) => { - let lsn_floor = delta.get_lsn_range().start; - if let Ok(contains) = delta.overlaps(&singleton_range(key), ctx) { - if !contains { - version = self.historic.get().unwrap().get_version(lsn_floor.0 - 1)?; - latest_delta = version.delta_coverage.query(key.to_i128()); - continue; - } - } + } else { + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); Some(SearchResult { layer: delta, lsn_floor, }) } - (Some(delta), Some(image)) => { - let img_lsn = image.get_lsn_range().start; - let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; - let image_exact_match = img_lsn + 1 == end_lsn; - if image_is_newer || image_exact_match { - Some(SearchResult { - layer: image, - lsn_floor: img_lsn, - }) - } else { - if let Ok(contains) = delta.overlaps(&singleton_range(key), ctx) { - if !contains { - version = self - .historic - .get() - .unwrap() - .get_version(delta.get_lsn_range().start.0 - 1)?; - latest_delta = version.delta_coverage.query(key.to_i128()); - continue; - } - } - let lsn_floor = std::cmp::max( - delta.get_lsn_range().start, - image.get_lsn_range().start + 1, - ); - Some(SearchResult { - layer: delta, - lsn_floor, - }) - } - } - }; + } } } @@ -294,17 +293,28 @@ where /// /// Helper function for BatchedUpdates::insert_historic /// - pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { - self.historic.insert( - historic_layer_coverage::LayerKey::from(&*layer), - Arc::clone(&layer), - ); - + pub(self) fn insert_historic_noflush( + &mut self, + layer: Arc, + ctx: &RequestContext, + ) -> Result<()> { + let lr = layer.get_lsn_range(); + for kr in layer.get_occupied_ranges(ctx)? 
{ + self.historic.insert( + historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }, + Arc::clone(&layer), + ); + } if Self::is_l0(&layer) { - self.l0_delta_layers.push(layer); + self.l0_delta_layers.insert(LayerRef(layer.clone())); } - + self.historic_layers.insert(LayerRef(layer)); NUM_ONDISK_LAYERS.inc(); + Ok(()) } /// @@ -312,24 +322,22 @@ where /// /// Helper function for BatchedUpdates::remove_historic /// - pub fn remove_historic_noflush(&mut self, layer: Arc) { - self.historic - .remove(historic_layer_coverage::LayerKey::from(&*layer)); - + pub fn remove_historic_noflush(&mut self, layer: Arc, ctx: &RequestContext) { + let lr = layer.get_lsn_range(); + for kr in layer.get_occupied_ranges(ctx)? { + assert!(self.historic.remove( + historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }, + Arc::clone(&layer), + }); + } if Self::is_l0(&layer) { - let len_before = self.l0_delta_layers.len(); - self.l0_delta_layers - .retain(|other| !Self::compare_arced_layers(other, &layer)); - // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers, - // there's a chance that the comparison fails at runtime due to it comparing (pointer, - // vtable) pairs. - assert_eq!( - self.l0_delta_layers.len(), - len_before - 1, - "failed to locate removed historic layer from l0_delta_layers" - ); + assert!(self.l0_delta_layers.remove(&LayerRef(layer.clone()))); } - + assert!(self.historic_layers.remove(&LayerRef(layer))); NUM_ONDISK_LAYERS.dec(); } @@ -337,7 +345,8 @@ where &mut self, expected: &Arc, new: Arc, - ) -> anyhow::Result>> { + ctx: &RequestContext, + ) -> anyhow::Result { let key = historic_layer_coverage::LayerKey::from(&**expected); let other = historic_layer_coverage::LayerKey::from(&*new); @@ -354,29 +363,53 @@ where "expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}" ); - let l0_index = if expected_l0 { - // find the index in case replace worked, we need to replace that as well - Some( - self.l0_delta_layers - .iter() - .position(|slot| Self::compare_arced_layers(slot, expected)) - .ok_or_else(|| anyhow::anyhow!("existing l0 delta layer was not found"))?, - ) - } else { - None - }; - - let replaced = self.historic.replace(&key, new.clone(), |existing| { - Self::compare_arced_layers(existing, expected) - }); - - if let Replacement::Replaced { .. } = &replaced { - if let Some(index) = l0_index { - self.l0_delta_layers[index] = new; - } - } - - Ok(replaced) + if self.histori_layers.remove(&LayerRef(expected.clone())) { + let lr = layer.get_lsn_range(); + for kr in layer.get_occupied_ranges(ctx)? { + match self.historic.replace(&historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }, key, new.clone(), |existing| { + Self::compare_arced_layers(existing, expected) + })? => { + Replacement::Replaced { .. } => { /* expected */ } + Replacement::NotFound => { + // TODO: the downloaded file should probably be removed, otherwise + // it will be added to the layermap on next load? we should + // probably restart any get_reconstruct_data search as well. 
+ // + // See: https://github.com/neondatabase/neon/issues/3533 + error!("replacing downloaded layer into layermap failed because layer was not found"); + } + Replacement::RemovalBuffered => { + unreachable!("current implementation does not remove anything") + } + Replacement::Unexpected(other) => { + // if the other layer would have the same pointer value as + // expected, it means they differ only on vtables. + // + // otherwise there's no known reason for this to happen as + // compacted layers should have different covering rectangle + // leading to produce Replacement::NotFound. + + error!( + expected.ptr = ?Arc::as_ptr(&l), + other.ptr = ?Arc::as_ptr(&other), + "replacing downloaded layer into layermap failed because another layer was found instead of expected" + ); + } + }; + } + if expected_l0 { + anyhow::ensure!(self.l0_delta_layers.remove(&LayerRef(expected.clone())), + "existing l0 delta layer was not found"); + self.l0_delta_layers.insert(&LayerRed(expected)); + } + Ok(true) + } else { + Ok(false) + } } /// Helper function for BatchedUpdates::drop. @@ -425,7 +458,7 @@ where } pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { - self.historic.iter() + self.historic_layers.iter().map(|r| r.0.clone()) } /// @@ -610,7 +643,7 @@ where partition_range: &Range, ctx: &RequestContext, ) -> usize { - match self.search(key, lsn, ctx) { + match self.search(key, lsn) { Some(search_result) => { if search_result.layer.is_incremental() { (Self::is_reimage_worthy(&search_result.layer, partition_range, ctx).unwrap() @@ -746,7 +779,7 @@ where /// Return all L0 delta layers pub fn get_level0_deltas(&self) -> Result>> { - Ok(self.l0_delta_layers.clone()) + Ok(self.l0_delta_layers.iter().map(|r| r.0.clone()).collect()) } /// debugging function to print out the contents of the layer map @@ -853,14 +886,10 @@ mod tests { map.batch_update().insert_historic(remote.clone()); assert_eq!(count_layer_in(&map, &remote), expected_in_counts); - let replaced = map + assert!(map .batch_update() .replace_historic(&remote, downloaded.clone()) - .expect("name derived attributes are the same"); - assert!( - matches!(replaced, Replacement::Replaced { .. }), - "{replaced:?}" - ); + .expect("name derived attributes are the same")); assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts); map.batch_update().remove_historic(downloaded.clone()); diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index b63c36131465..03eb860e56e3 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -521,17 +521,6 @@ impl BufferedHistoricLayerCoverage { ) } - /// Iterate all the layers - pub fn iter(&self) -> impl '_ + Iterator { - // NOTE we can actually perform this without rebuilding, - // but it's not necessary for now. - if !self.buffer.is_empty() { - panic!("rebuild pls") - } - - self.layers.values().cloned() - } - /// Return a reference to a queryable map, assuming all updates /// have already been processed using self.rebuild() pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage> { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 888c847a36fb..b1099670b2cc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1240,7 +1240,11 @@ impl Timeline { /// Scan the timeline directory to populate the layer map. /// Returns all timeline-related files that were found and loaded. 
/// - pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { + pub(super) fn load_layer_map( + &self, + disk_consistent_lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let mut layers = self.layers.write().unwrap(); let mut updates = layers.batch_update(); let mut num_layers = 0; @@ -1284,7 +1288,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer), ctx)?; num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1317,7 +1321,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer), ctx)?; num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1364,6 +1368,7 @@ impl Timeline { index_part: &IndexPart, local_layers: HashMap>, up_to_date_disk_consistent_lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result>> { // Are we missing some files that are present in remote storage? // Create RemoteLayer instances for them. @@ -1454,7 +1459,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - updates.insert_historic(remote_layer); + updates.insert_historic(remote_layer, ctx)?; } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1478,7 +1483,7 @@ impl Timeline { LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), ); let remote_layer = Arc::new(remote_layer); - updates.insert_historic(remote_layer); + updates.insert_historic(remote_layer, ctx)?; } } } @@ -1533,7 +1538,7 @@ impl Timeline { index_part.timeline_layers.len() ); remote_client.init_upload_queue(index_part)?; - self.create_remote_layers(index_part, local_layers, disk_consistent_lsn) + self.create_remote_layers(index_part, local_layers, disk_consistent_lsn, ctx) .await? } None => { @@ -2040,9 +2045,7 @@ impl Timeline { } } - if let Some(SearchResult { lsn_floor, layer }) = - layers.search(key, cont_lsn, ctx) - { + if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { // If it's a remote layer, download it and retry. if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) @@ -2498,7 +2501,7 @@ impl Timeline { .write() .unwrap() .batch_update() - .insert_historic(Arc::new(new_delta)); + .insert_historic(Arc::new(new_delta), ctx)?; // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2686,7 +2689,7 @@ impl Timeline { self.metrics .resident_physical_size_gauge .add(metadata.len()); - updates.insert_historic(Arc::new(l)); + updates.insert_historic(Arc::new(l), ctx)?; } updates.flush(); drop(layers); @@ -3016,7 +3019,7 @@ impl Timeline { LayerFileMetadata::new(metadata.len(), l.get_holes(ctx)?), ); let x: Arc = Arc::new(l); - updates.insert_historic(x); + updates.insert_historic(x, ctx)?; } // Now that we have reshuffled the data to set of new delta layers, we can @@ -3541,41 +3544,10 @@ impl Timeline { let mut layers = self_clone.layers.write().unwrap(); let mut updates = layers.batch_update(); { - use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); - match updates.replace_historic(&l, new_layer) { - Ok(Replacement::Replaced { .. 
}) => { /* expected */ } - Ok(Replacement::NotFound) => { - // TODO: the downloaded file should probably be removed, otherwise - // it will be added to the layermap on next load? we should - // probably restart any get_reconstruct_data search as well. - // - // See: https://github.com/neondatabase/neon/issues/3533 - error!("replacing downloaded layer into layermap failed because layer was not found"); - } - Ok(Replacement::RemovalBuffered) => { - unreachable!("current implementation does not remove anything") - } - Ok(Replacement::Unexpected(other)) => { - // if the other layer would have the same pointer value as - // expected, it means they differ only on vtables. - // - // otherwise there's no known reason for this to happen as - // compacted layers should have different covering rectangle - // leading to produce Replacement::NotFound. - - error!( - expected.ptr = ?Arc::as_ptr(&l), - other.ptr = ?Arc::as_ptr(&other), - ?other, - "replacing downloaded layer into layermap failed because another layer was found instead of expected" - ); - } - Err(e) => { - // this is a precondition failure, the layer filename derived - // attributes didn't match up, which doesn't seem likely. - error!("replacing downloaded layer into layermap failed: {e:#?}") - } + if !updates.replace_historic(&l, new_layer, &ctx)? { + // See: https://github.com/neondatabase/neon/issues/3533 + error!("replacing downloaded layer into layermap failed because layer was not found"); } } updates.flush(); From a58b7039000849cb49a41ae0a66c8c790c0cfd0c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 14:54:42 +0200 Subject: [PATCH 26/44] Remove all occupied segments in layer map --- pageserver/src/tenant/layer_map.rs | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 3ab93f15b225..f55a68ae55bf 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -155,6 +155,7 @@ where /// /// This should be called when the corresponding file on disk has been deleted. /// +<<<<<<< HEAD pub fn remove_historic(&mut self, layer: Arc, ctx: &RequestContext) { self.layer_map.remove_historic_noflush(layer, ctx) } @@ -176,6 +177,10 @@ where ctx: &RequestContext, ) -> anyhow::Result { self.layer_map.replace_historic_noflush(expected, new, ctx) +======= + pub fn remove_historic(&mut self, layer: Arc, ctx: &RequestContext) -> Result<()> { + self.layer_map.remove_historic_noflush(layer, ctx) +>>>>>>> 2b84898d (Remove all occupied segments in layer map) } // We will flush on drop anyway, but this method makes it @@ -246,7 +251,7 @@ where let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); - match (latest_delta, latest_image.clone()) { + match (latest_delta, latest_image) { (None, None) => None, (None, Some(image)) => { let lsn_floor = image.get_lsn_range().start; @@ -322,23 +327,22 @@ where /// /// Helper function for BatchedUpdates::remove_historic /// - pub fn remove_historic_noflush(&mut self, layer: Arc, ctx: &RequestContext) { + pub fn remove_historic_noflush(&mut self, layer: Arc, ctx: &RequestContext) -> Result<()> { let lr = layer.get_lsn_range(); for kr in layer.get_occupied_ranges(ctx)? 
{ - assert!(self.historic.remove( - historic_layer_coverage::LayerKey { - key: kr.start.to_i128()..kr.end.to_i128(), - lsn: lr.start.0..lr.end.0, - is_image: !layer.is_incremental(), - }, - Arc::clone(&layer), - }); - } + self.historic.remove(historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }); + } + if Self::is_l0(&layer) { assert!(self.l0_delta_layers.remove(&LayerRef(layer.clone()))); } assert!(self.historic_layers.remove(&LayerRef(layer))); NUM_ONDISK_LAYERS.dec(); + Ok(()) } pub(self) fn replace_historic_noflush( From 161e0d8756696c350f94f66cf1ae7019d8374449 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 14:54:54 +0200 Subject: [PATCH 27/44] Remove all occupied segments in layer map --- pageserver/src/tenant/timeline.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b1099670b2cc..f37613f34a68 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1416,7 +1416,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - updates.remove_historic(local_layer); + updates.remove_historic(local_layer, ctx)?; // fall-through to adding the remote layer } } else { @@ -3027,7 +3027,7 @@ impl Timeline { let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); for l in deltas_to_compact { layer_names_to_delete.push(l.filename()); - self.delete_historic_layer(layer_removal_cs, l, &mut updates)?; + self.delete_historic_layer(layer_removal_cs, l, &mut updates, ctx)?; } updates.flush(); drop(layers); From 5e2cea26d2c8207ffdb7b04244629ffaeebc67ac Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 16:22:05 +0200 Subject: [PATCH 28/44] Rebase with main --- pageserver/src/http/routes.rs | 3 +- pageserver/src/tenant/layer_map.rs | 66 +++++++++++++++--------------- pageserver/src/tenant/timeline.rs | 49 ++++++++-------------- 3 files changed, 51 insertions(+), 67 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 229cf96ee37b..39995f9c7ccc 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -570,6 +570,7 @@ async fn layer_download_handler(request: Request) -> Result } async fn evict_timeline_layer_handler(request: Request) -> Result, ApiError> { + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -577,7 +578,7 @@ async fn evict_timeline_layer_handler(request: Request) -> Result, ctx: &RequestContext) { + + pub fn remove_historic(&mut self, layer: Arc, ctx: &RequestContext) -> Result<()> { self.layer_map.remove_historic_noflush(layer, ctx) } @@ -174,13 +174,9 @@ where &mut self, expected: &Arc, new: Arc, - ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result { self.layer_map.replace_historic_noflush(expected, new, ctx) -======= - pub fn remove_historic(&mut self, layer: Arc, ctx: &RequestContext) -> Result<()> { - self.layer_map.remove_historic_noflush(layer, ctx) ->>>>>>> 2b84898d (Remove all occupied segments in layer map) } // We will flush on drop anyway, but this method makes it @@ -349,7 +345,7 @@ 
where &mut self, expected: &Arc, new: Arc, - ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result { let key = historic_layer_coverage::LayerKey::from(&**expected); let other = historic_layer_coverage::LayerKey::from(&*new); @@ -367,16 +363,18 @@ where "expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}" ); - if self.histori_layers.remove(&LayerRef(expected.clone())) { - let lr = layer.get_lsn_range(); - for kr in layer.get_occupied_ranges(ctx)? { - match self.historic.replace(&historic_layer_coverage::LayerKey { - key: kr.start.to_i128()..kr.end.to_i128(), - lsn: lr.start.0..lr.end.0, - is_image: !layer.is_incremental(), - }, key, new.clone(), |existing| { - Self::compare_arced_layers(existing, expected) - })? => { + if self.historic_layers.remove(&LayerRef(expected.clone())) { + let lr = expected.get_lsn_range(); + for kr in expected.get_occupied_ranges(ctx)? { + match self.historic.replace( + &historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !expected.is_incremental(), + }, + new.clone(), + |existing| Self::compare_arced_layers(existing, expected), + ) { Replacement::Replaced { .. } => { /* expected */ } Replacement::NotFound => { // TODO: the downloaded file should probably be removed, otherwise @@ -384,12 +382,12 @@ where // probably restart any get_reconstruct_data search as well. // // See: https://github.com/neondatabase/neon/issues/3533 - error!("replacing downloaded layer into layermap failed because layer was not found"); + anyhow::bail!("replacing downloaded layer into layermap failed because layer was not found"); } Replacement::RemovalBuffered => { unreachable!("current implementation does not remove anything") } - Replacement::Unexpected(other) => { + Replacement::Unexpected(_other) => { // if the other layer would have the same pointer value as // expected, it means they differ only on vtables. // @@ -397,23 +395,23 @@ where // compacted layers should have different covering rectangle // leading to produce Replacement::NotFound. - error!( - expected.ptr = ?Arc::as_ptr(&l), - other.ptr = ?Arc::as_ptr(&other), + anyhow::bail!( "replacing downloaded layer into layermap failed because another layer was found instead of expected" ); } - }; - } - if expected_l0 { - anyhow::ensure!(self.l0_delta_layers.remove(&LayerRef(expected.clone())), - "existing l0 delta layer was not found"); - self.l0_delta_layers.insert(&LayerRed(expected)); - } - Ok(true) - } else { - Ok(false) - } + }; + } + if expected_l0 { + anyhow::ensure!( + self.l0_delta_layers.remove(&LayerRef(expected.clone())), + "existing l0 delta layer was not found" + ); + self.l0_delta_layers.insert(LayerRef(expected.clone())); + } + Ok(true) + } else { + Ok(false) + } } /// Helper function for BatchedUpdates::drop. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f37613f34a68..080fb872c984 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -869,7 +869,7 @@ impl Timeline { /// Like [`evict_layer_batch`], but for just one layer. /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`. 
- pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub async fn evict_layer(&self, layer_file_name: &str, ctx: &RequestContext) -> anyhow::Result> { let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) }; let remote_client = self .remote_client @@ -963,6 +963,7 @@ impl Timeline { local_layer .file_size() .expect("Local layer should have a file size"), + local_layer.get_holes(ctx)?, ); let new_remote_layer = Arc::new(match local_layer.filename() { LayerFileName::Image(image_name) => RemoteLayer::new_img( @@ -985,37 +986,20 @@ impl Timeline { ), }); - let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? { - Replacement::Replaced { .. } => { - let layer_size = local_layer.file_size(); + let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer, &ctx)?; + if replaced { + let layer_size = local_layer.file_size(); - if let Err(e) = local_layer.delete() { - error!("failed to remove layer file on evict after replacement: {e:#?}"); - } - - if let Some(layer_size) = layer_size { - self.metrics.resident_physical_size_gauge.sub(layer_size); - } - - true - } - Replacement::NotFound => { - debug!(evicted=?local_layer, "layer was no longer in layer map"); - false - } - Replacement::RemovalBuffered => { - unreachable!("not doing anything else in this batch") + if let Err(e) = local_layer.delete() { + error!("failed to remove layer file on evict after replacement: {e:#?}"); } - Replacement::Unexpected(other) => { - error!( - local_layer.ptr=?Arc::as_ptr(local_layer), - other.ptr=?Arc::as_ptr(&other), - ?other, - "failed to replace"); - false - } - }; + if let Some(layer_size) = layer_size { + self.metrics.resident_physical_size_gauge.sub(layer_size); + } + } else { + debug!(evicted=?local_layer, "layer was no longer in layer map"); + } Ok(replaced) } } @@ -1838,6 +1822,7 @@ impl Timeline { _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, layer: Arc, updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, + ctx: &RequestContext, ) -> anyhow::Result<()> { let layer_size = layer.file_size(); @@ -1851,7 +1836,7 @@ impl Timeline { // won't be needed for page reconstruction for this timeline, // and mark what we can't delete yet as deleted from the layer // map index without actually rebuilding the index. - updates.remove_historic(layer); + updates.remove_historic(layer, ctx)?; Ok(()) } @@ -3172,7 +3157,7 @@ impl Timeline { pitr_cutoff, retain_lsns, new_gc_cutoff, - ctx, + ctx, ) .instrument( info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff), @@ -3349,7 +3334,7 @@ impl Timeline { { for doomed_layer in layers_to_remove { layer_names_to_delete.push(doomed_layer.filename()); - self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning? + self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates, ctx)?; // FIXME: schedule succeeded deletions before returning? 
result.layers_removed += 1; } } From e020247d552164770dd32ea5e1b7d4d7bb861364 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 17:18:08 +0200 Subject: [PATCH 29/44] Cleanup after merge with main --- pageserver/benches/bench_layer_map.rs | 13 +++---- pageserver/src/tenant/layer_map.rs | 39 ++++++++++++++----- pageserver/src/tenant/storage_layer.rs | 2 + pageserver/src/tenant/timeline.rs | 4 +- test_runner/regress/test_ondemand_download.py | 1 + 5 files changed, 41 insertions(+), 18 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 0ba5379d1c31..ef7943d6f4c1 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -18,6 +18,7 @@ use utils::lsn::Lsn; use criterion::{criterion_group, criterion_main, Criterion}; fn build_layer_map(filename_dump: PathBuf) -> LayerMap { + let ctx = RequestContext::new(TaskKind::Benchmark, DownloadBehavior::Error); let mut layer_map = LayerMap::::default(); let mut min_lsn = Lsn(u64::MAX); @@ -35,7 +36,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { min_lsn = min(min_lsn, lsn_range.start); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer), &ctx).unwrap(); } println!("min: {min_lsn}, max: {max_lsn}"); @@ -108,7 +109,6 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> // a project where we have run pgbench many timmes. The pgbench database was initialized // between each test run. fn bench_from_captest_env(c: &mut Criterion) { - let ctx = RequestContext::new(TaskKind::Benchmark, DownloadBehavior::Error); // TODO consider compressing this file let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); @@ -116,7 +116,7 @@ fn bench_from_captest_env(c: &mut Criterion) { c.bench_function("captest_uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1, &ctx); + layer_map.search(q.0, q.1); } }); }); @@ -128,7 +128,6 @@ fn bench_from_captest_env(c: &mut Criterion) { Key::from_hex("000000067F00008000000000000000000001").unwrap(), // This LSN is higher than any of the LSNs in the tree Lsn::from_str("D0/80208AE1").unwrap(), - &ctx, ); result.unwrap(); }); @@ -188,7 +187,7 @@ fn bench_from_real_project(c: &mut Criterion) { group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1, &ctx); + layer_map.search(q.0, q.1); } }); }); @@ -221,7 +220,7 @@ fn bench_sequential(c: &mut Criterion) { is_incremental: false, short_id: format!("Layer {}", i), }; - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer), &ctx).unwrap(); } updates.flush(); println!("Finished layer map init in {:?}", now.elapsed()); @@ -238,7 +237,7 @@ fn bench_sequential(c: &mut Criterion) { group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1, &ctx); + layer_map.search(q.0, q.1); } }); }); diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 5c34d77074cb..4c764fd09d7f 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -363,8 +363,8 @@ where "expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}" ); - if self.historic_layers.remove(&LayerRef(expected.clone())) { - 
let lr = expected.get_lsn_range(); + let lr = expected.get_lsn_range(); + let replaced = if self.historic_layers.remove(&LayerRef(expected.clone())) { for kr in expected.get_occupied_ranges(ctx)? { match self.historic.replace( &historic_layer_coverage::LayerKey { @@ -406,12 +406,26 @@ where self.l0_delta_layers.remove(&LayerRef(expected.clone())), "existing l0 delta layer was not found" ); - self.l0_delta_layers.insert(LayerRef(expected.clone())); } - Ok(true) + true } else { - Ok(false) + for kr in new.get_occupied_ranges(ctx)? { + self.historic.insert( + historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !new.is_incremental(), + }, + Arc::clone(&new), + ); + } + false + }; + if expected_l0 { + self.l0_delta_layers.insert(LayerRef(new.clone())); } + self.historic_layers.insert(LayerRef(new)); + Ok(replaced) } /// Helper function for BatchedUpdates::drop. @@ -833,7 +847,9 @@ where #[cfg(test)] mod tests { - use super::{LayerMap, Replacement}; + use super::LayerMap; + use crate::context::{DownloadBehavior, RequestContext}; + use crate::task_mgr::TaskKind; use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName}; use std::str::FromStr; use std::sync::Arc; @@ -872,6 +888,7 @@ mod tests { } fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let name = LayerFileName::from_str(layer_name).unwrap(); let skeleton = LayerDescriptor::from(name); @@ -885,16 +902,20 @@ mod tests { let expected_in_counts = (1, usize::from(expected_l0)); - map.batch_update().insert_historic(remote.clone()); + map.batch_update() + .insert_historic(remote.clone(), &ctx) + .expect("historic layer is inserted"); assert_eq!(count_layer_in(&map, &remote), expected_in_counts); assert!(map .batch_update() - .replace_historic(&remote, downloaded.clone()) + .replace_historic(&remote, downloaded.clone(), &ctx) .expect("name derived attributes are the same")); assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts); - map.batch_update().remove_historic(downloaded.clone()); + map.batch_update() + .remove_historic(downloaded.clone(), &ctx) + .expect("downloaded layer is found"); assert_eq!(count_layer_in(&map, &downloaded), (0, 0)); } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 50ea66c63317..e4f60e8d9b26 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -398,6 +398,8 @@ pub struct LayerDescriptor { #[derive(Debug, Clone, PartialEq, Eq)] pub struct Hole(Range); +// Alter default serde serialization for Key class to reduce size of index_part.json file. +// Instead of dumping Key as json object with six field? 
we just store hex string representing key (as in file name) impl Serialize for Hole { fn serialize(&self, serializer: S) -> Result where diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 080fb872c984..c60c8d9dcdec 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -963,7 +963,7 @@ impl Timeline { local_layer .file_size() .expect("Local layer should have a file size"), - local_layer.get_holes(ctx)?, + local_layer.get_holes(ctx)?, ); let new_remote_layer = Arc::new(match local_layer.filename() { LayerFileName::Image(image_name) => RemoteLayer::new_img( @@ -1822,7 +1822,7 @@ impl Timeline { _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, layer: Arc, updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, - ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let layer_size = layer.file_size(); diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index df2f797ab930..7e648ef03196 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -213,6 +213,7 @@ def get_resident_physical_size(): log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" + # Wait until generated image layers are uploaded to S3 time.sleep(3) env.pageserver.stop() From 88fc5c32d9c8b37db17694072fed1508d940fe8d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 19:27:00 +0200 Subject: [PATCH 30/44] Minor refactoring --- pageserver/benches/bench_layer_map.rs | 1 + pageserver/src/tenant/layer_map.rs | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index ef7943d6f4c1..44a66031c698 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -112,6 +112,7 @@ fn bench_from_captest_env(c: &mut Criterion) { // TODO consider compressing this file let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); + // Test with uniform query pattern c.bench_function("captest_uniform_queries", |b| { b.iter(|| { diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 4c764fd09d7f..c825b61df7a0 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -364,7 +364,8 @@ where ); let lr = expected.get_lsn_range(); - let replaced = if self.historic_layers.remove(&LayerRef(expected.clone())) { + let replaced = self.historic_layers.remove(&LayerRef(expected.clone())); + if relaced { for kr in expected.get_occupied_ranges(ctx)? { match self.historic.replace( &historic_layer_coverage::LayerKey { @@ -407,7 +408,6 @@ where "existing l0 delta layer was not found" ); } - true } else { for kr in new.get_occupied_ranges(ctx)? 
{ self.historic.insert( @@ -419,8 +419,7 @@ where Arc::clone(&new), ); } - false - }; + } if expected_l0 { self.l0_delta_layers.insert(LayerRef(new.clone())); } From 6aab1a91ffcfa41a1f7be2ab7ae080d8f2991921 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 19:32:51 +0200 Subject: [PATCH 31/44] Minor refactoring --- pageserver/src/tenant/layer_map.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index c825b61df7a0..dc5463050195 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -365,7 +365,7 @@ where let lr = expected.get_lsn_range(); let replaced = self.historic_layers.remove(&LayerRef(expected.clone())); - if relaced { + if replaced { for kr in expected.get_occupied_ranges(ctx)? { match self.historic.replace( &historic_layer_coverage::LayerKey { From d89e5a751413111b1e78da9cdecdd1b9a345cbf4 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 22:00:33 +0200 Subject: [PATCH 32/44] Ignore load layer error in get_occupied_ranges --- .../src/tenant/storage_layer/delta_layer.rs | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 86fd3226a594..6715f7431351 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -338,20 +338,20 @@ impl Layer for DeltaLayer { } fn get_occupied_ranges(&self, ctx: &RequestContext) -> Result>> { - let inner = self.load(ctx)?; - if let Some(holes) = &inner.holes { - let mut occ = Vec::with_capacity(holes.len() + 1); - let key_range = self.get_key_range(); - let mut prev = key_range.start; - for hole in holes { - occ.push(prev..hole.0.start); - prev = hole.0.end; - } - occ.push(prev..key_range.end); - Ok(occ) - } else { - Ok(vec![self.get_key_range()]) - } + if let Ok(inner) = self.load(ctx) { + if let Some(holes) = &inner.holes { + let mut occ = Vec::with_capacity(holes.len() + 1); + let key_range = self.get_key_range(); + let mut prev = key_range.start; + for hole in holes { + occ.push(prev..hole.0.start); + prev = hole.0.end; + } + occ.push(prev..key_range.end); + return Ok(occ); + } + } + Ok(vec![self.get_key_range()]) } fn get_holes(&self, ctx: &RequestContext) -> Result>> { From be24282f469a430213602085ba440495775a782d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 22:00:44 +0200 Subject: [PATCH 33/44] Ignore load layer error in get_occupied_ranges --- .../src/tenant/storage_layer/delta_layer.rs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6715f7431351..627bc341062d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -339,18 +339,18 @@ impl Layer for DeltaLayer { fn get_occupied_ranges(&self, ctx: &RequestContext) -> Result>> { if let Ok(inner) = self.load(ctx) { - if let Some(holes) = &inner.holes { - let mut occ = Vec::with_capacity(holes.len() + 1); - let key_range = self.get_key_range(); - let mut prev = key_range.start; - for hole in holes { - occ.push(prev..hole.0.start); - prev = hole.0.end; - } - occ.push(prev..key_range.end); - return Ok(occ); - } - } + if let Some(holes) = &inner.holes { + let mut occ = 
Vec::with_capacity(holes.len() + 1); + let key_range = self.get_key_range(); + let mut prev = key_range.start; + for hole in holes { + occ.push(prev..hole.0.start); + prev = hole.0.end; + } + occ.push(prev..key_range.end); + return Ok(occ); + } + } Ok(vec![self.get_key_range()]) } From 689ef9e728a1af111abcaad1dde0fd47f9556941 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 22:28:19 +0200 Subject: [PATCH 34/44] Sort layers in LayerMapInfo --- test_runner/fixtures/neon_fixtures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b35252243edd..492620ee3dcc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1531,11 +1531,13 @@ def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo: assert isinstance(json_in_memory_layers, List) for json_in_memory_layer in json_in_memory_layers: info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer)) + info.in_memory_layers.sort() json_historic_layers = d["historic_layers"] assert isinstance(json_historic_layers, List) for json_historic_layer in json_historic_layers: info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer)) + info.historic_layers.sort() return info From e068e48224f6aebb45906f34e354e867ead2d43c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 4 Feb 2023 22:58:07 +0200 Subject: [PATCH 35/44] Sort layers in LayerMapInfo --- test_runner/fixtures/neon_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 492620ee3dcc..a0cd6996e83f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1531,13 +1531,13 @@ def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo: assert isinstance(json_in_memory_layers, List) for json_in_memory_layer in json_in_memory_layers: info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer)) - info.in_memory_layers.sort() + info.in_memory_layers.sort(key=lambda info: info.lsn_start) json_historic_layers = d["historic_layers"] assert isinstance(json_historic_layers, List) for json_historic_layer in json_historic_layers: info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer)) - info.historic_layers.sort() + info.historic_layers.sort(key=lambda info: info.layer_file_name) return info From 8051f94ce3d2f70157505bb76b14e5bb16caf9c4 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 6 Feb 2023 21:28:03 +0200 Subject: [PATCH 36/44] Update pageserver/src/tenant/layer_map.rs Co-authored-by: Joonas Koivunen --- pageserver/src/tenant/layer_map.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index dc5463050195..cf2974b859ec 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -76,7 +76,7 @@ impl Eq for LayerRef {} impl Hash for LayerRef { fn hash(&self, state: &mut H) { - Arc::into_raw(Arc::clone(&self.0)).hash(state) + Arc::as_ptr(&self.0).hash(state) } } From 04c81fc4f33ea72a734b97254f22381fd3b36600 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 6 Feb 2023 22:06:36 +0200 Subject: [PATCH 37/44] Add test for format version 2 of index_part.json --- .../tenant/remote_timeline_client/index.rs | 28 +++++++++++++------ pageserver/src/tenant/storage_layer.rs | 2 +- 2 files changed, 21 
insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index e046d1a36c26..680960d1ab5e 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -214,7 +214,7 @@ impl IndexPart { /// used to understand later versions. /// /// Version is currently informative only. - const LATEST_VERSION: usize = 1; + const LATEST_VERSION: usize = 2; pub const FILE_NAME: &'static str = "index_part.json"; pub fn new( @@ -264,6 +264,7 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { #[cfg(test)] mod tests { use super::*; + use crate::repository::Key; #[test] fn v0_indexpart_is_parsed() { @@ -287,13 +288,22 @@ mod tests { } #[test] - fn v1_indexpart_is_parsed() { + fn v2_indexpart_is_parsed() { let example = r#"{ - "version":1, + "version": 2, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], "layer_metadata":{ - "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, - "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { + "file_size": 25600000, + "holes": [{"start": "000000000000000000000000000000000000", "end": "010000000000000000000000000000000000"}] + }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { + "file_size": 9007199254741001, + "holes": [ + {"start": "010000000000000000000000000000000000", "end": "020000000000000000000000000000000001"}, + {"start": "030000000000000000000000000000000000", "end": "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"} + ] + } }, "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -301,18 +311,20 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? 
- version: 1, + version: 2, timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: Some(25600000), - holes: None, + holes: Some(vec![Hole(Key::from_hex("000000000000000000000000000000000000").unwrap()..Key::from_hex("010000000000000000000000000000000000").unwrap())]), }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: Some(9007199254741001), - holes: None, + holes: Some(vec![Hole(Key::from_hex("010000000000000000000000000000000000").unwrap()..Key::from_hex("020000000000000000000000000000000001").unwrap()), + Hole(Key::from_hex("030000000000000000000000000000000000").unwrap()..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap()), + ]), }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index e4f60e8d9b26..7ed3e3055202 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -396,7 +396,7 @@ pub struct LayerDescriptor { /// Wrapper for key range to provide reverse ordering by range length (for BinaryHeap) #[derive(Debug, Clone, PartialEq, Eq)] -pub struct Hole(Range); +pub struct Hole(pub Range); // Alter default serde serialization for Key class to reduce size of index_part.json file. // Instead of dumping Key as json object with six field? 
we just store hex string representing key (as in file name) From 4b3fe741ab98c773ed9c1f8f95856291965d487c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 6 Feb 2023 22:46:37 +0200 Subject: [PATCH 38/44] Try to use wait_for_upload in test_ondemand_download_timetravel --- test_runner/regress/test_ondemand_download.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 7e648ef03196..9a0bfd16b470 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -190,6 +190,11 @@ def test_ondemand_download_timetravel( # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, timeline_id, current_lsn) + log.info("uploads have finished") + + ##### Stop the first pageserver instance, erase all its data env.postgres.stop_all() @@ -213,9 +218,6 @@ def get_resident_physical_size(): log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" - # Wait until generated image layers are uploaded to S3 - time.sleep(3) - env.pageserver.stop() # remove all the layer files From ed44d66b791fd08964636aede7e4806d5b396a1d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 7 Feb 2023 11:23:31 +0200 Subject: [PATCH 39/44] Restore v1_indexpart_is_parsed test --- .../tenant/remote_timeline_client/index.rs | 75 ++++++++++++++----- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 680960d1ab5e..088204b3e358 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -288,22 +288,13 @@ mod tests { } #[test] - fn v2_indexpart_is_parsed() { + fn v1_indexpart_is_parsed() { let example = r#"{ - "version": 2, + "version":1, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], "layer_metadata":{ - "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { - "file_size": 25600000, - "holes": [{"start": "000000000000000000000000000000000000", "end": "010000000000000000000000000000000000"}] - }, - "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { - "file_size": 9007199254741001, - "holes": [ - {"start": "010000000000000000000000000000000000", "end": "020000000000000000000000000000000001"}, - {"start": "030000000000000000000000000000000000", "end": "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"} - ] - } + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", 
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -311,20 +302,18 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? - version: 2, + version: 1, timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: Some(25600000), - holes: Some(vec![Hole(Key::from_hex("000000000000000000000000000000000000").unwrap()..Key::from_hex("010000000000000000000000000000000000").unwrap())]), + holes: None, }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: Some(9007199254741001), - holes: Some(vec![Hole(Key::from_hex("010000000000000000000000000000000000").unwrap()..Key::from_hex("020000000000000000000000000000000001").unwrap()), - Hole(Key::from_hex("030000000000000000000000000000000000").unwrap()..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap()), - ]), + holes: None, }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), @@ -375,4 +364,54 @@ mod tests { let part = part.remove_unclean_layer_file_names(); assert_eq!(part, expected); } + + #[test] + fn v2_indexpart_is_parsed() { + let example = r#"{ + "version": 2, + "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { + "file_size": 25600000, + "holes": [{"start": "000000000000000000000000000000000000", "end": "010000000000000000000000000000000000"}] + }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { + "file_size": 9007199254741001, + "holes": [ + {"start": "010000000000000000000000000000000000", "end": "020000000000000000000000000000000001"}, + {"start": "030000000000000000000000000000000000", "end": "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"} + ] + } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] + }"#; + + let expected = IndexPart { + // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? + version: 2, + timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + file_size: Some(25600000), + holes: Some(vec![Hole(Key::from_hex("000000000000000000000000000000000000").unwrap()..Key::from_hex("010000000000000000000000000000000000").unwrap())]), + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: Some(9007199254741001), + holes: Some(vec![Hole(Key::from_hex("010000000000000000000000000000000000").unwrap()..Key::from_hex("020000000000000000000000000000000001").unwrap()), + Hole(Key::from_hex("030000000000000000000000000000000000").unwrap()..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap()), + ]), + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + }; + + let part = serde_json::from_str::(example) + .unwrap() + .remove_unclean_layer_file_names(); + assert_eq!(part, expected); + } } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index f210ac524b42..e9c23004da28 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit f210ac524b42d2d6f404f8505c64de36e977d17c +Subproject commit e9c23004da2891cdfe3c1f108b19ca7d93846643 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 33f976345490..39a65d110298 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 33f976345490351f951d72f81621c2263c186c9a +Subproject commit 39a65d11029806797bf9453bdc516a3a543c612b From ad4d67883d6a2de3e82ff3b320cecd1c88b9febc Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 7 Feb 2023 15:25:59 +0200 Subject: [PATCH 40/44] Merge with main --- libs/pageserver_api/src/models.rs | 1 + pageserver/src/tenant/storage_layer.rs | 5 ++--- .../src/tenant/storage_layer/delta_layer.rs | 6 +++--- .../src/tenant/storage_layer/remote_layer.rs | 1 - pageserver/src/tenant/timeline.rs | 18 +++++++++++++++++- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9cdcf3a17391..48657f9dbc02 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -241,6 +241,7 @@ pub struct LayerMapInfo { #[repr(usize)] pub enum LayerAccessKind { GetValueReconstructData, + ExtractHoles, Iter, KeyIter, Dump, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 7ed3e3055202..bff40b3c8413 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -15,11 +15,10 @@ use anyhow::Result; use bytes::Bytes; use enum_map::EnumMap; use enumset::EnumSet; -use pageserver_api::models::LayerAccessKind; use pageserver_api::models::{ - HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, + HistoricLayerInfo, LayerAccessKind, 
LayerResidenceEvent, LayerResidenceEventReason, + LayerResidenceStatus, }; -use pageserver_api::models::HistoricLayerInfo; use serde::ser::SerializeMap; use serde::{de::MapAccess, de::Visitor, Deserialize, Deserializer, Serialize, Serializer}; use std::fmt; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 627bc341062d..37f6dc508f33 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -338,7 +338,7 @@ impl Layer for DeltaLayer { } fn get_occupied_ranges(&self, ctx: &RequestContext) -> Result>> { - if let Ok(inner) = self.load(ctx) { + if let Ok(inner) = self.load(LayerAccessKind::ExtractHoles, ctx) { if let Some(holes) = &inner.holes { let mut occ = Vec::with_capacity(holes.len() + 1); let key_range = self.get_key_range(); @@ -355,7 +355,7 @@ impl Layer for DeltaLayer { } fn get_holes(&self, ctx: &RequestContext) -> Result>> { - let inner = self.load(ctx)?; + let inner = self.load(LayerAccessKind::ExtractHoles, ctx)?; Ok(inner.holes.clone()) } @@ -363,7 +363,7 @@ impl Layer for DeltaLayer { if !range_overlaps(&self.key_range, key_range) { Ok(false) } else { - let inner = self.load(ctx)?; + let inner = self.load(LayerAccessKind::ExtractHoles, ctx)?; if let Some(holes) = &inner.holes { let start = match holes.binary_search_by_key(&key_range.start, |hole| hole.0.start) { diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index ef911aa22269..bfd9b9b2e5bc 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -293,7 +293,6 @@ impl RemoteLayer { file_size, self.access_stats .clone_for_residence_change(LayerResidenceStatus::Resident), - self.get_holes().unwrap(), self.get_holes(ctx).unwrap(), )) } else { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c60c8d9dcdec..545f109dd511 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -867,9 +867,25 @@ impl Timeline { Ok(Some(true)) } +<<<<<<< HEAD /// Like [`evict_layer_batch`], but for just one layer. /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`. 
pub async fn evict_layer(&self, layer_file_name: &str, ctx: &RequestContext) -> anyhow::Result> { +======= + /// Evicts one layer as in replaces a downloaded layer with a remote layer + /// + /// Returns: + /// - `Ok(Some(true))` when the layer was replaced + /// - `Ok(Some(false))` when the layer was found, but no changes were made + /// - evictee was not yet downloaded + /// - layermap replacement failed + /// - `Ok(None)` when the layer is not found + pub async fn evict_layer( + &self, + layer_file_name: &str, + ctx: &RequestContext, + ) -> anyhow::Result> { +>>>>>>> da3955c6 (Merge with main) let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) }; let remote_client = self .remote_client @@ -997,7 +1013,7 @@ impl Timeline { if let Some(layer_size) = layer_size { self.metrics.resident_physical_size_gauge.sub(layer_size); } - } else { + } else { debug!(evicted=?local_layer, "layer was no longer in layer map"); } Ok(replaced) From 284d1679564dd66718eaf7fdb88f2e7336cc62d0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 7 Feb 2023 17:55:34 +0200 Subject: [PATCH 41/44] Restore sleep in test_ondemand_download.py --- pageserver/src/tenant/timeline.rs | 16 ---------------- test_runner/regress/test_ondemand_download.py | 8 +++----- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 545f109dd511..ef43224f46f0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -867,25 +867,9 @@ impl Timeline { Ok(Some(true)) } -<<<<<<< HEAD /// Like [`evict_layer_batch`], but for just one layer. /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`. pub async fn evict_layer(&self, layer_file_name: &str, ctx: &RequestContext) -> anyhow::Result> { -======= - /// Evicts one layer as in replaces a downloaded layer with a remote layer - /// - /// Returns: - /// - `Ok(Some(true))` when the layer was replaced - /// - `Ok(Some(false))` when the layer was found, but no changes were made - /// - evictee was not yet downloaded - /// - layermap replacement failed - /// - `Ok(None)` when the layer is not found - pub async fn evict_layer( - &self, - layer_file_name: &str, - ctx: &RequestContext, - ) -> anyhow::Result> { ->>>>>>> da3955c6 (Merge with main) let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) }; let remote_client = self .remote_client diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 9a0bfd16b470..7e648ef03196 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -190,11 +190,6 @@ def test_ondemand_download_timetravel( # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) - # wait until pageserver successfully uploaded a checkpoint to remote storage - wait_for_upload(client, tenant_id, timeline_id, current_lsn) - log.info("uploads have finished") - - ##### Stop the first pageserver instance, erase all its data env.postgres.stop_all() @@ -218,6 +213,9 @@ def get_resident_physical_size(): log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" + # Wait until generated image layers are uploaded to S3 + time.sleep(3) + env.pageserver.stop() # remove all the layer files From e9b0e4379c761e876c0411985942f609e1c8a8e0 Mon Sep 17 00:00:00 2001 
From: Konstantin Knizhnik Date: Thu, 9 Feb 2023 09:13:19 +0200 Subject: [PATCH 42/44] Update pageserver/src/tenant/layer_map.rs Co-authored-by: bojanserafimov --- pageserver/src/tenant/layer_map.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index cf2974b859ec..c7c87d156749 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -794,7 +794,7 @@ where /// Return all L0 delta layers pub fn get_level0_deltas(&self) -> Result>> { - Ok(self.l0_delta_layers.iter().map(|r| r.0.clone()).collect()) + Ok(self.l0_delta_layers.keys().cloned().collect()) } /// debugging function to print out the contents of the layer map From 6e5efc5dc11aa03d74afa97b36b3ea1e1dfc9025 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 9 Feb 2023 10:59:44 +0200 Subject: [PATCH 43/44] Treate Arc as raw pointers in hash implementation for LayerRef --- pageserver/src/tenant/layer_map.rs | 60 +++++++++++++++--------------- pageserver/src/tenant/timeline.rs | 22 +++++++---- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index c7c87d156749..d77659116020 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -64,11 +64,33 @@ pub use historic_layer_coverage::Replacement; use super::storage_layer::range_eq; +pub fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { + // "dyn Trait" objects are "fat pointers" in that they have two components: + // - pointer to the object + // - pointer to the vtable + // + // rust does not provide a guarantee that these vtables are unique, but however + // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the + // pointer and the vtable need to be equal. + // + // See: https://github.com/rust-lang/rust/issues/103763 + // + // A future version of rust will most likely use this form below, where we cast each + // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it + // not affect the comparison. + // + // See: https://github.com/rust-lang/rust/pull/106450 + let left = Arc::as_ptr(left) as *const (); + let right = Arc::as_ptr(right) as *const (); + + left == right +} + struct LayerRef(Arc); impl PartialEq for LayerRef { fn eq(&self, other: &LayerRef) -> bool { - Arc::ptr_eq(&self.0, &other.0) + compare_arced_layers(&self.0, &other.0) } } @@ -76,7 +98,8 @@ impl Eq for LayerRef {} impl Hash for LayerRef { fn hash(&self, state: &mut H) { - Arc::as_ptr(&self.0).hash(state) + let ptr = Arc::as_ptr(&self.0) as *const (); + ptr.hash(state) } } @@ -374,7 +397,7 @@ where is_image: !expected.is_incremental(), }, new.clone(), - |existing| Self::compare_arced_layers(existing, expected), + |existing| compare_arced_layers(existing, expected), ) { Replacement::Replaced { .. 
} => { /* expected */ } Replacement::NotFound => { @@ -794,7 +817,7 @@ where /// Return all L0 delta layers pub fn get_level0_deltas(&self) -> Result>> { - Ok(self.l0_delta_layers.keys().cloned().collect()) + Ok(self.l0_delta_layers.iter().map(|r| r.0.clone()).collect()) } /// debugging function to print out the contents of the layer map @@ -819,29 +842,6 @@ where println!("End dump LayerMap"); Ok(()) } - - #[inline(always)] - fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { - // "dyn Trait" objects are "fat pointers" in that they have two components: - // - pointer to the object - // - pointer to the vtable - // - // rust does not provide a guarantee that these vtables are unique, but however - // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the - // pointer and the vtable need to be equal. - // - // See: https://github.com/rust-lang/rust/issues/103763 - // - // A future version of rust will most likely use this form below, where we cast each - // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it - // not affect the comparison. - // - // See: https://github.com/rust-lang/rust/pull/106450 - let left = Arc::as_ptr(left) as *const (); - let right = Arc::as_ptr(right) as *const (); - - left == right - } } #[cfg(test)] @@ -897,7 +897,7 @@ mod tests { let mut map = LayerMap::default(); // two disjoint Arcs in different lifecycle phases. - assert!(!LayerMap::compare_arced_layers(&remote, &downloaded)); + assert!(!compare_arced_layers(&remote, &downloaded)); let expected_in_counts = (1, usize::from(expected_l0)); @@ -921,14 +921,14 @@ mod tests { fn count_layer_in(map: &LayerMap, layer: &Arc) -> (usize, usize) { let historic = map .iter_historic_layers() - .filter(|x| LayerMap::compare_arced_layers(x, layer)) + .filter(|x| compare_arced_layers(x, layer)) .count(); let l0s = map .get_level0_deltas() .expect("why does this return a result"); let l0 = l0s .iter() - .filter(|x| LayerMap::compare_arced_layers(x, layer)) + .filter(|x| compare_arced_layers(x, layer)) .count(); (historic, l0) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ef43224f46f0..24e1ab48d882 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -869,7 +869,11 @@ impl Timeline { /// Like [`evict_layer_batch`], but for just one layer. /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`. 
- pub async fn evict_layer(&self, layer_file_name: &str, ctx: &RequestContext) -> anyhow::Result> { + pub async fn evict_layer( + &self, + layer_file_name: &str, + ctx: &RequestContext, + ) -> anyhow::Result> { let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) }; let remote_client = self .remote_client @@ -878,7 +882,7 @@ impl Timeline { let cancel = CancellationToken::new(); let results = self - .evict_layer_batch(remote_client, &[local_layer], cancel) + .evict_layer_batch(remote_client, &[local_layer], cancel, ctx) .await?; assert_eq!(results.len(), 1); let result: Option> = results.into_iter().next().unwrap(); @@ -912,6 +916,7 @@ impl Timeline { remote_client: &Arc, layers_to_evict: &[Arc], cancel: CancellationToken, + ctx: &RequestContext, ) -> anyhow::Result>>> { // ensure that the layers have finished uploading // (don't hold the layer_removal_cs while we do it, we're not removing anything yet) @@ -933,7 +938,7 @@ impl Timeline { let res = if cancel.is_cancelled() { None } else { - Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates)) + Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates, ctx)) }; results.push(res); } @@ -952,9 +957,8 @@ impl Timeline { _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, local_layer: &Arc, batch_updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, + ctx: &RequestContext, ) -> anyhow::Result { - use super::layer_map::Replacement; - if local_layer.is_remote_layer() { return Ok(false); } @@ -986,8 +990,8 @@ impl Timeline { ), }); - let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer, &ctx)?; - if replaced { + let replaced = batch_updates.replace_historic(local_layer, new_remote_layer, &ctx)?; + if replaced { let layer_size = local_layer.file_size(); if let Err(e) = local_layer.delete() { @@ -2479,6 +2483,10 @@ impl Timeline { self.conf.timeline_path(&self.timeline_id, &self.tenant_id), ])?; + // + // This call force extraction of hole info from new delta layer so + // there is no need to perform this expensive operation under layer map write lock below + // let holes = new_delta.get_holes(ctx)?; // Add it to the layer map From d78fbb0bc59b6df6292a482ab389118d3bebfb30 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 9 Feb 2023 14:52:54 +0200 Subject: [PATCH 44/44] Mak eclippy happy --- pageserver/src/tenant/layer_map.rs | 1 + pageserver/src/tenant/timeline.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index d77659116020..fe7101885c2f 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -849,6 +849,7 @@ mod tests { use super::LayerMap; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::TaskKind; + use crate::tenant::layer_map::compare_arced_layers; use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName}; use std::str::FromStr; use std::sync::Arc; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 24e1ab48d882..651522525701 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -990,7 +990,7 @@ impl Timeline { ), }); - let replaced = batch_updates.replace_historic(local_layer, new_remote_layer, &ctx)?; + let replaced = batch_updates.replace_historic(local_layer, new_remote_layer, ctx)?; if replaced { let layer_size = local_layer.file_size();