Merged
4 changes: 4 additions & 0 deletions datafusion/core/src/execution/context/mod.rs
@@ -1068,6 +1068,10 @@ impl SessionContext {
builder.with_max_temp_directory_size(directory_size as u64)
}
"temp_directory" => builder.with_temp_file_path(value),
"metadata_cache_limit" => {
let limit = Self::parse_memory_limit(value)?;
builder.with_metadata_cache_limit(limit)
}
_ => {
return Err(DataFusionError::Plan(format!(
"Unknown runtime configuration: {variable}"
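For reference, a minimal usage sketch of the new runtime option (it mirrors the test added below; the accepted size suffixes are whatever parse_memory_limit supports, and the snippet assumes an async fn returning datafusion's Result):

```rust
// Hedged sketch: set the metadata cache limit through SQL on a SessionContext.
let ctx = SessionContext::new();
ctx.sql("SET datafusion.runtime.metadata_cache_limit = '100M'")
    .await?
    .collect()
    .await?;
```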
33 changes: 33 additions & 0 deletions datafusion/core/tests/sql/runtime_config.rs
@@ -200,6 +200,39 @@ async fn test_max_temp_directory_size_enforcement() {
);
}

#[tokio::test]
async fn test_metadata_cache_limit() {
let ctx = SessionContext::new();

let update_limit = async |ctx: &SessionContext, limit: &str| {
ctx.sql(
format!("SET datafusion.runtime.metadata_cache_limit = '{limit}'").as_str(),
)
.await
.unwrap()
.collect()
.await
.unwrap();
};

let get_limit = |ctx: &SessionContext| -> usize {
ctx.task_ctx()
.runtime_env()
.cache_manager
.get_file_metadata_cache()
.cache_limit()
};

update_limit(&ctx, "100M").await;
assert_eq!(get_limit(&ctx), 100 * 1024 * 1024);

update_limit(&ctx, "2G").await;
assert_eq!(get_limit(&ctx), 2 * 1024 * 1024 * 1024);

update_limit(&ctx, "123K").await;
Contributor: nice

assert_eq!(get_limit(&ctx), 123 * 1024);
}

#[tokio::test]
async fn test_unknown_runtime_config() {
let ctx = SessionContext::new();
19 changes: 8 additions & 11 deletions datafusion/datasource-parquet/src/file_format.rs
@@ -449,17 +449,14 @@ impl FileFormat for ParquetFormat {

// Use the CachedParquetFileReaderFactory when metadata caching is enabled
if self.options.global.cache_metadata {
Contributor: What are your thoughts about (in a follow-on PR) removing the options.cache_metadata and always trying to save the metadata (which would be a no-op if there is no room)?

Contributor Author: I think caching by default would be good. The only situation where it wouldn't help would be one-time scans of Parquet files that do not require the page index, but for large files the scan should largely outweigh the page-index retrieval anyway.

if let Some(metadata_cache) =
state.runtime_env().cache_manager.get_file_metadata_cache()
{
let store = state
.runtime_env()
.object_store(conf.object_store_url.clone())?;
let cached_parquet_read_factory =
Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache));
source =
source.with_parquet_file_reader_factory(cached_parquet_read_factory);
}
let metadata_cache =
state.runtime_env().cache_manager.get_file_metadata_cache();
let store = state
.runtime_env()
.object_store(conf.object_store_url.clone())?;
let cached_parquet_read_factory =
Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache));
source = source.with_parquet_file_reader_factory(cached_parquet_read_factory);
}

if let Some(metadata_size_hint) = metadata_size_hint {
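Regarding the review discussion above about caching by default, a rough, hypothetical sketch (not part of this PR) of what dropping the cache_metadata guard could look like, using only the calls already present in this hunk:

```rust
// Hypothetical follow-on change: always install the caching reader factory and
// let a size-limited cache decline entries for which there is no room.
let metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache();
let store = state
    .runtime_env()
    .object_store(conf.object_store_url.clone())?;
source = source.with_parquet_file_reader_factory(Arc::new(
    CachedParquetFileReaderFactory::new(store, metadata_cache),
));
```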
4 changes: 4 additions & 0 deletions datafusion/datasource-parquet/src/reader.rs
@@ -288,4 +288,8 @@ impl FileMetadata for CachedParquetMetaData {
fn as_any(&self) -> &dyn Any {
self
}

fn memory_size(&self) -> usize {
self.0.memory_size()
}
}
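The new memory_size method is what lets the cache account for each entry; a hypothetical custom implementor (type and field names are illustrative, and the import path is assumed from the trait's location in cache_manager.rs) might look like:

```rust
use std::any::Any;
// Assumed import path for the trait defined in cache_manager.rs below.
use datafusion_execution::cache::cache_manager::FileMetadata;

// Illustrative only: a metadata type that owns its bytes on the heap.
struct InMemoryFileMetadata {
    bytes: Vec<u8>,
}

impl FileMetadata for InMemoryFileMetadata {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn memory_size(&self) -> usize {
        // Report the heap footprint so a size-limited cache can enforce its limit.
        self.bytes.capacity()
    }
}
```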
79 changes: 59 additions & 20 deletions datafusion/execution/src/cache/cache_manager.rs
@@ -39,12 +39,20 @@ pub trait FileMetadata: Any + Send + Sync {
/// Returns the file metadata as [`Any`] so that it can be downcasted to a specific
/// implementation.
fn as_any(&self) -> &dyn Any;

/// Returns the size of the metadata in bytes.
fn memory_size(&self) -> usize;
}

/// Cache to store file-embedded metadata.
pub trait FileMetadataCache:
CacheAccessor<ObjectMeta, Arc<dyn FileMetadata>, Extra = ObjectMeta>
{
/// Returns the cache's memory limit in bytes.
fn cache_limit(&self) -> usize;

/// Updates the cache with a new memory limit in bytes.
fn update_cache_limit(&self, limit: usize);
}

impl Debug for dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta> {
@@ -65,30 +73,36 @@ impl Debug for dyn FileMetadataCache {
}
}

#[derive(Default, Debug)]
#[derive(Debug)]
pub struct CacheManager {
file_statistic_cache: Option<FileStatisticsCache>,
list_files_cache: Option<ListFilesCache>,
file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
file_metadata_cache: Arc<dyn FileMetadataCache>,
Contributor: Seeing the idea of having a default file_metadata_cache installed got me thinking about @BlakeOrth's comment here: #16971 (comment). After this work to cache file metadata, it seems like we may want to consider adding default caches for ListFiles and FileStatistics as well 🤔 (as a follow-on PR, of course).

}

impl CacheManager {
pub fn try_new(config: &CacheManagerConfig) -> Result<Arc<Self>> {
let mut manager = CacheManager::default();
if let Some(cc) = &config.table_files_statistics_cache {
manager.file_statistic_cache = Some(Arc::clone(cc))
}
if let Some(lc) = &config.list_files_cache {
manager.list_files_cache = Some(Arc::clone(lc))
}
if let Some(mc) = &config.file_metadata_cache {
manager.file_metadata_cache = Some(Arc::clone(mc));
} else {
manager.file_metadata_cache =
Some(Arc::new(DefaultFilesMetadataCache::default()));
}

Ok(Arc::new(manager))
let file_statistic_cache =
config.table_files_statistics_cache.as_ref().map(Arc::clone);

let list_files_cache = config.list_files_cache.as_ref().map(Arc::clone);

let file_metadata_cache = config
.file_metadata_cache
.as_ref()
.map(Arc::clone)
.unwrap_or_else(|| {
Arc::new(DefaultFilesMetadataCache::new(config.metadata_cache_limit))
});

// the cache memory limit might have changed, ensure the limit is updated
file_metadata_cache.update_cache_limit(config.metadata_cache_limit);

Ok(Arc::new(CacheManager {
file_statistic_cache,
list_files_cache,
file_metadata_cache,
}))
}

/// Get the cache of listing files statistics.
@@ -102,12 +116,19 @@ impl CacheManager {
}

/// Get the file embedded metadata cache.
pub fn get_file_metadata_cache(&self) -> Option<Arc<dyn FileMetadataCache>> {
self.file_metadata_cache.clone()
pub fn get_file_metadata_cache(&self) -> Arc<dyn FileMetadataCache> {
Arc::clone(&self.file_metadata_cache)
}

/// Get the limit of the file embedded metadata cache.
pub fn get_metadata_cache_limit(&self) -> usize {
self.file_metadata_cache.cache_limit()
}
}

#[derive(Clone, Default)]
const DEFAULT_METADATA_CACHE_LIMIT: usize = 50 * 1024 * 1024; // 50M

#[derive(Clone)]
pub struct CacheManagerConfig {
/// Enable cache of files statistics when listing files.
/// Avoid get same file statistics repeatedly in same datafusion session.
@@ -124,6 +145,19 @@ pub struct CacheManagerConfig {
/// data file (e.g., Parquet footer and page metadata).
/// If not provided, the [`CacheManager`] will create a [`DefaultFilesMetadataCache`].
pub file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
/// Limit of the file-embedded metadata cache, in bytes.
pub metadata_cache_limit: usize,
}

impl Default for CacheManagerConfig {
fn default() -> Self {
Self {
table_files_statistics_cache: Default::default(),
list_files_cache: Default::default(),
file_metadata_cache: Default::default(),
metadata_cache_limit: DEFAULT_METADATA_CACHE_LIMIT,
}
}
}

impl CacheManagerConfig {
@@ -147,4 +181,9 @@ impl CacheManagerConfig {
self.file_metadata_cache = cache;
self
}

pub fn with_metadata_cache_limit(mut self, limit: usize) -> Self {
self.metadata_cache_limit = limit;
self
}
}
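Putting the configuration pieces together, a minimal programmatic sketch that uses only APIs shown in this diff (error handling elided):

```rust
// Build a cache manager with a custom metadata cache limit and read it back.
let config = CacheManagerConfig::default()
    .with_metadata_cache_limit(200 * 1024 * 1024); // 200 MB
let cache_manager = CacheManager::try_new(&config)?;
assert_eq!(cache_manager.get_metadata_cache_limit(), 200 * 1024 * 1024);

// The limit can also be adjusted later on the running cache.
cache_manager
    .get_file_metadata_cache()
    .update_cache_limit(64 * 1024 * 1024);
```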