@@ -25,19 +25,34 @@ use std::collections::HashMap;
2525use std:: fmt:: { Debug , Formatter } ;
2626use std:: sync:: Arc ;
2727
28- /// The cache of listing files statistics.
29- /// if set [`CacheManagerConfig::with_files_statistics_cache`]
30- /// Will avoid infer same file statistics repeatedly during the session lifetime,
31- /// this cache will store in [`crate::runtime_env::RuntimeEnv`].
28+ /// A cache for [`Statistics`].
29+ ///
30+ /// If enabled via [`CacheManagerConfig::with_files_statistics_cache`] this
31+ /// cache avoids inferring the same file statistics repeatedly during the
32+ /// session lifetime.
33+ ///
34+ /// See [`crate::runtime_env::RuntimeEnv`] for more details.
3235pub type FileStatisticsCache =
3336 Arc < dyn CacheAccessor < Path , Arc < Statistics > , Extra = ObjectMeta > > ;
3437
38+ /// Cache for storing the [`ObjectMeta`]s that result from listing a path
39+ ///
40+ /// Listing a path means doing an object store "list" operation or `ls`
41+ /// command on the local filesystem. This operation can be expensive,
42+ /// especially when done over remote object stores.
43+ ///
44+ /// See [`crate::runtime_env::RuntimeEnv`] for more details.
3545pub type ListFilesCache =
3646 Arc < dyn CacheAccessor < Path , Arc < Vec < ObjectMeta > > , Extra = ObjectMeta > > ;
3747
38- /// Represents generic file-embedded metadata.
48+ /// Generic file-embedded metadata used with [`FileMetadataCache`].
49+ ///
50+ /// For example, Parquet footers and page metadata can be represented
51+ /// using this trait.
52+ ///
53+ /// See [`crate::runtime_env::RuntimeEnv`] for more details.
3954pub trait FileMetadata : Any + Send + Sync {
40- /// Returns the file metadata as [`Any`] so that it can be downcasted to a specific
55+ /// Returns the file metadata as [`Any`] so that it can be downcast to a specific
4156 /// implementation.
4257 fn as_any ( & self ) -> & dyn Any ;
4358
@@ -48,7 +63,20 @@ pub trait FileMetadata: Any + Send + Sync {
4863 fn extra_info ( & self ) -> HashMap < String , String > ;
4964}
5065
51- /// Cache to store file-embedded metadata.
66+ /// Cache for file-embedded metadata.
67+ ///
68+ /// This cache stores per-file metadata in the form of [`FileMetadata`].
69+ ///
70+ /// For example, the built-in [`ListingTable`] uses this cache to avoid parsing
71+ /// Parquet footers multiple times for the same file.
72+ ///
73+ /// DataFusion provides a default implementation, [`DefaultFilesMetadataCache`],
74+ /// and users can also provide their own implementations to implement custom
75+ /// caching strategies.
76+ ///
77+ /// See [`crate::runtime_env::RuntimeEnv`] for more details.
78+ ///
79+ /// [`ListingTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html
5280pub trait FileMetadataCache :
5381 CacheAccessor < ObjectMeta , Arc < dyn FileMetadata > , Extra = ObjectMeta >
5482{
@@ -93,6 +121,13 @@ impl Debug for dyn FileMetadataCache {
93121 }
94122}
95123
124+ /// Manages various caches used in DataFusion.
125+ ///
126+ /// Following DataFusion design principles, DataFusion provides default cache
127+ /// implementations, while also allowing users to provide their own custom cache
128+ /// implementations by implementing the relevant traits.
129+ ///
130+ /// See [`CacheManagerConfig`] for configuration options.
96131#[ derive( Debug ) ]
97132pub struct CacheManager {
98133 file_statistic_cache : Option < FileStatisticsCache > ,
@@ -130,7 +165,7 @@ impl CacheManager {
130165 self . file_statistic_cache . clone ( )
131166 }
132167
133- /// Get the cache of objectMeta under same path.
168+ /// Get the cache for storing the result of listing [`ObjectMeta`]s under the same path.
134169 pub fn get_list_files_cache ( & self ) -> Option < ListFilesCache > {
135170 self . list_files_cache . clone ( )
136171 }
@@ -181,6 +216,9 @@ impl Default for CacheManagerConfig {
181216}
182217
183218impl CacheManagerConfig {
219+ /// Sets the cache for file statistics.
220+ ///
221+ /// Default is `None` (disabled).
184222 pub fn with_files_statistics_cache (
185223 mut self ,
186224 cache : Option < FileStatisticsCache > ,
@@ -189,11 +227,17 @@ impl CacheManagerConfig {
189227 self
190228 }
191229
230+ /// Set the cache for listing files.
231+ ///
232+ /// Default is `None` (disabled).
192233 pub fn with_list_files_cache ( mut self , cache : Option < ListFilesCache > ) -> Self {
193234 self . list_files_cache = cache;
194235 self
195236 }
196237
238+ /// Sets the cache for file-embedded metadata.
239+ ///
240+ /// Default is a [`DefaultFilesMetadataCache`].
197241 pub fn with_file_metadata_cache (
198242 mut self ,
199243 cache : Option < Arc < dyn FileMetadataCache > > ,
@@ -202,6 +246,7 @@ impl CacheManagerConfig {
202246 self
203247 }
204248
249+ /// Sets the limit of the file-embedded metadata cache, in bytes.
205250 pub fn with_metadata_cache_limit ( mut self , limit : usize ) -> Self {
206251 self . metadata_cache_limit = limit;
207252 self
0 commit comments