feat: Limit the memory used in the file metadata cache #17031
@@ -449,17 +449,14 @@ impl FileFormat for ParquetFormat {
 
         // Use the CachedParquetFileReaderFactory when metadata caching is enabled
         if self.options.global.cache_metadata {
-            if let Some(metadata_cache) =
-                state.runtime_env().cache_manager.get_file_metadata_cache()
-            {
-                let store = state
-                    .runtime_env()
-                    .object_store(conf.object_store_url.clone())?;
-                let cached_parquet_read_factory =
-                    Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache));
-                source =
-                    source.with_parquet_file_reader_factory(cached_parquet_read_factory);
-            }
+            let metadata_cache =
+                state.runtime_env().cache_manager.get_file_metadata_cache();
+            let store = state
+                .runtime_env()
+                .object_store(conf.object_store_url.clone())?;
+            let cached_parquet_read_factory =
+                Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache));
+            source = source.with_parquet_file_reader_factory(cached_parquet_read_factory);
         }
 
         if let Some(metadata_size_hint) = metadata_size_hint {

Review comment (on the `if self.options.global.cache_metadata {` line): What are your thoughts about (in a follow on PR) removing the

Reply: I think caching by default would be good. The only situation where it wouldn't help would be one-time scans of parquet files that do not require the page index, but for large files the scan should largely outweigh the page index retrieval anyway.

Reply: I filed a ticket to track
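For context on the wiring above: once a session is running, the same metadata cache that `ParquetFormat` pulls from the runtime environment can be inspected directly. A minimal sketch, assuming the `datafusion` facade crate and the post-PR API where `get_file_metadata_cache` returns the cache directly rather than an `Option`; this is illustrative usage, not code from the PR:

```rust
use datafusion::prelude::SessionContext;

fn main() {
    let ctx = SessionContext::new();
    // `cache_manager` is a public field of RuntimeEnv, as used in the diff above.
    let metadata_cache = ctx.runtime_env().cache_manager.get_file_metadata_cache();
    // After this PR there is always a cache installed, with a configurable byte limit.
    println!("metadata cache limit: {} bytes", metadata_cache.cache_limit());
}
```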
@@ -39,12 +39,20 @@ pub trait FileMetadata: Any + Send + Sync {
     /// Returns the file metadata as [`Any`] so that it can be downcasted to a specific
     /// implementation.
     fn as_any(&self) -> &dyn Any;
+
+    /// Returns the size of the metadata in bytes.
+    fn memory_size(&self) -> usize;
 }
 
 /// Cache to store file-embedded metadata.
 pub trait FileMetadataCache:
     CacheAccessor<ObjectMeta, Arc<dyn FileMetadata>, Extra = ObjectMeta>
 {
+    // Returns the cache's memory limit in bytes.
+    fn cache_limit(&self) -> usize;
+
+    // Updates the cache with a new memory limit in bytes.
+    fn update_cache_limit(&self, limit: usize);
 }
 
 impl Debug for dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta> {
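The new `memory_size` method is what makes a byte limit enforceable: the cache can sum the reported sizes of its entries and evict once the total exceeds `cache_limit`. The sketch below is not the `DefaultFilesMetadataCache` implementation from this PR; the `ToyMetadata`/`ToyMetadataCache` types and the FIFO eviction policy are purely illustrative of that accounting idea:

```rust
use std::collections::VecDeque;
use std::sync::Arc;

// Stand-in for a FileMetadata implementor (e.g. cached Parquet footer metadata).
struct ToyMetadata {
    bytes: Vec<u8>,
}

impl ToyMetadata {
    // Mirrors the new `memory_size` method: report the entry's heap footprint.
    fn memory_size(&self) -> usize {
        self.bytes.len()
    }
}

// Minimal FIFO cache that enforces a byte limit using `memory_size`.
struct ToyMetadataCache {
    limit: usize,
    used: usize,
    entries: VecDeque<Arc<ToyMetadata>>,
}

impl ToyMetadataCache {
    fn new(limit: usize) -> Self {
        Self { limit, used: 0, entries: VecDeque::new() }
    }

    // Analogue of `update_cache_limit`: change the limit and re-enforce it.
    fn update_cache_limit(&mut self, limit: usize) {
        self.limit = limit;
        self.evict();
    }

    fn put(&mut self, entry: Arc<ToyMetadata>) {
        self.used += entry.memory_size();
        self.entries.push_back(entry);
        self.evict();
    }

    // Drop the oldest entries until the accounted memory fits the limit.
    fn evict(&mut self) {
        while self.used > self.limit {
            match self.entries.pop_front() {
                Some(old) => self.used -= old.memory_size(),
                None => break,
            }
        }
    }
}

fn main() {
    let mut cache = ToyMetadataCache::new(64);
    cache.put(Arc::new(ToyMetadata { bytes: vec![0; 40] }));
    cache.put(Arc::new(ToyMetadata { bytes: vec![0; 40] }));
    // The first entry was evicted so the accounted size stays under 64 bytes.
    assert_eq!(cache.entries.len(), 1);
    assert_eq!(cache.used, 40);
}
```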
@@ -65,30 +73,36 @@ impl Debug for dyn FileMetadataCache {
     }
 }
 
-#[derive(Default, Debug)]
+#[derive(Debug)]
 pub struct CacheManager {
     file_statistic_cache: Option<FileStatisticsCache>,
     list_files_cache: Option<ListFilesCache>,
-    file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
+    file_metadata_cache: Arc<dyn FileMetadataCache>,
 }
 
 impl CacheManager {
     pub fn try_new(config: &CacheManagerConfig) -> Result<Arc<Self>> {
-        let mut manager = CacheManager::default();
-        if let Some(cc) = &config.table_files_statistics_cache {
-            manager.file_statistic_cache = Some(Arc::clone(cc))
-        }
-        if let Some(lc) = &config.list_files_cache {
-            manager.list_files_cache = Some(Arc::clone(lc))
-        }
-        if let Some(mc) = &config.file_metadata_cache {
-            manager.file_metadata_cache = Some(Arc::clone(mc));
-        } else {
-            manager.file_metadata_cache =
-                Some(Arc::new(DefaultFilesMetadataCache::default()));
-        }
-
-        Ok(Arc::new(manager))
+        let file_statistic_cache =
+            config.table_files_statistics_cache.as_ref().map(Arc::clone);
+
+        let list_files_cache = config.list_files_cache.as_ref().map(Arc::clone);
+
+        let file_metadata_cache = config
+            .file_metadata_cache
+            .as_ref()
+            .map(Arc::clone)
+            .unwrap_or_else(|| {
+                Arc::new(DefaultFilesMetadataCache::new(config.metadata_cache_limit))
+            });
+
+        // the cache memory limit might have changed, ensure the limit is updated
+        file_metadata_cache.update_cache_limit(config.metadata_cache_limit);
+
+        Ok(Arc::new(CacheManager {
+            file_statistic_cache,
+            list_files_cache,
+            file_metadata_cache,
+        }))
     }
 
     /// Get the cache of listing files statistics.

Review comment (on the `file_metadata_cache: Arc<dyn FileMetadataCache>` field): Seeing the idea of having a default file_metadata_cache installed got me thinking about @BlakeOrth's comment here: #16971 (comment). After this work to cache file metadata, it seems like we may want to consider adding default caches for ListFiles and FileStatistics as well 🤔 (as a follow on PR of course)
@@ -102,12 +116,19 @@ impl CacheManager {
     }
 
     /// Get the file embedded metadata cache.
-    pub fn get_file_metadata_cache(&self) -> Option<Arc<dyn FileMetadataCache>> {
-        self.file_metadata_cache.clone()
+    pub fn get_file_metadata_cache(&self) -> Arc<dyn FileMetadataCache> {
+        Arc::clone(&self.file_metadata_cache)
     }
+
+    /// Get the limit of the file embedded metadata cache.
+    pub fn get_metadata_cache_limit(&self) -> usize {
+        self.file_metadata_cache.cache_limit()
+    }
 }
 
-#[derive(Clone, Default)]
+const DEFAULT_METADATA_CACHE_LIMIT: usize = 50 * 1024 * 1024; // 50M
+
+#[derive(Clone)]
 pub struct CacheManagerConfig {
     /// Enable cache of files statistics when listing files.
     /// Avoid get same file statistics repeatedly in same datafusion session.
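With the changes above, `CacheManager::try_new` always installs a metadata cache and the default limit is 50 MB (`DEFAULT_METADATA_CACHE_LIMIT`). A small usage sketch, assuming the `datafusion-execution` and `datafusion-common` crate paths shown here (illustrative, not code from the PR):

```rust
use datafusion_execution::cache::cache_manager::{CacheManager, CacheManagerConfig};

fn main() -> datafusion_common::Result<()> {
    // With no cache configured, a DefaultFilesMetadataCache is created and
    // its limit is set to the 50 MB default.
    let manager = CacheManager::try_new(&CacheManagerConfig::default())?;
    assert_eq!(manager.get_metadata_cache_limit(), 50 * 1024 * 1024);
    Ok(())
}
```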
@@ -124,6 +145,19 @@ pub struct CacheManagerConfig {
     /// data file (e.g., Parquet footer and page metadata).
     /// If not provided, the [`CacheManager`] will create a [`DefaultFilesMetadataCache`].
     pub file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
+    /// Limit of the file-embedded metadata cache, in bytes.
+    pub metadata_cache_limit: usize,
+}
+
+impl Default for CacheManagerConfig {
+    fn default() -> Self {
+        Self {
+            table_files_statistics_cache: Default::default(),
+            list_files_cache: Default::default(),
+            file_metadata_cache: Default::default(),
+            metadata_cache_limit: DEFAULT_METADATA_CACHE_LIMIT,
+        }
+    }
 }
 
 impl CacheManagerConfig {
@@ -147,4 +181,9 @@ impl CacheManagerConfig {
         self.file_metadata_cache = cache;
         self
     }
+
+    pub fn with_metadata_cache_limit(mut self, limit: usize) -> Self {
+        self.metadata_cache_limit = limit;
+        self
+    }
 }
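A sketch of end-to-end configuration with the new builder method. The `RuntimeEnvBuilder::with_cache_manager`, `build_arc`, and `SessionContext::new_with_config_rt` calls are assumed from the existing DataFusion API and are not part of this diff:

```rust
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_execution::cache::cache_manager::CacheManagerConfig;
use datafusion_execution::runtime_env::RuntimeEnvBuilder;

fn main() -> datafusion_common::Result<()> {
    // Raise the metadata cache limit to 200 MB via the new builder method.
    let cache_config =
        CacheManagerConfig::default().with_metadata_cache_limit(200 * 1024 * 1024);
    let runtime = RuntimeEnvBuilder::new()
        .with_cache_manager(cache_config)
        .build_arc()?;
    let ctx = SessionContext::new_with_config_rt(SessionConfig::new(), runtime);
    // The limit configured above is visible through the CacheManager accessor.
    assert_eq!(
        ctx.runtime_env().cache_manager.get_metadata_cache_limit(),
        200 * 1024 * 1024
    );
    Ok(())
}
```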
Review comment: nice