diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 8858413bf5df..f85492d8c2ed 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -212,10 +212,7 @@ pub struct ListingOptions {
     /// The file format
     pub format: Arc<dyn FileFormat>,
     /// The expected partition column names in the folder structure.
-    /// For example `Vec["a", "b"]` means that the two first levels of
-    /// partitioning expected should be named "a" and "b":
-    /// - If there is a third level of partitioning it will be ignored.
-    /// - Files that don't follow this partitioning will be ignored.
+    /// See [Self::with_table_partition_cols] for details
     pub table_partition_cols: Vec<(String, DataType)>,
     /// Set true to try to guess statistics from the files.
     /// This can add a lot of overhead as it will usually require files
@@ -297,9 +294,45 @@ impl ListingOptions {
         self
     }
 
-    /// Set table partition column names on [`ListingOptions`] and returns self.
+    /// Set `table partition columns` on [`ListingOptions`] and returns self.
     ///
-    /// You may use [`wrap_partition_type_in_dict`] to request a dictionary-encoded type.
+    /// "Partition columns," used to support [Hive Partitioning], are
+    /// columns added to the data that is read, based on the folder
+    /// structure where the data resides.
+    ///
+    /// For example, given the following files in your filesystem:
+    ///
+    /// ```text
+    /// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet
+    /// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet
+    /// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet
+    /// ```
+    ///
+    /// A [`ListingTable`] created at `/mnt/nyctaxi/` with partition
+    /// columns "year" and "month" will include new `year` and `month`
+    /// columns while reading the files. The `year` column would have
+    /// value `2022` and the `month` column would have value `01` for
+    /// the rows read from
+    /// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet`
+    ///
+    /// # Notes
+    ///
+    /// - If only one level (e.g. `year` in the example above) is
+    /// specified, the other levels are ignored but the files are
+    /// still read.
+    ///
+    /// - Files that don't follow this partitioning scheme will be
+    /// ignored.
+    ///
+    /// - Since the columns have the same value for all rows read from
+    /// each individual file (such as dates), they are typically
+    /// dictionary encoded for efficiency. You may use
+    /// [`wrap_partition_type_in_dict`] to request a
+    /// dictionary-encoded type.
+    ///
+    /// - The partition columns are solely extracted from the file path. In particular, they are NOT part of the parquet files themselves.
+    ///
+    /// # Example
     ///
     /// ```
     /// # use std::sync::Arc;
@@ -307,6 +340,8 @@ impl ListingOptions {
     /// # use datafusion::prelude::col;
     /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
     ///
+    /// // listing options for files with paths such as `/mnt/data/col_a=x/col_b=y/data.parquet`
+    /// // `col_a` and `col_b` will be included in the data read from those files
     /// let listing_options = ListingOptions::new(Arc::new(
     ///     ParquetFormat::default()
     ///   ))
@@ -317,7 +352,7 @@ impl ListingOptions {
     ///     ("col_b".to_string(), DataType::Utf8)]);
     /// ```
     ///
-    ///
+    /// [Hive Partitioning]: https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html
     /// [`wrap_partition_type_in_dict`]: crate::physical_plan::file_format::wrap_partition_type_in_dict
     pub fn with_table_partition_cols(
         mut self,
diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs
index eb70d18ef028..d4ef60a41588 100644
--- a/datafusion/core/src/physical_plan/file_format/mod.rs
+++ b/datafusion/core/src/physical_plan/file_format/mod.rs
@@ -69,22 +69,23 @@ use std::{
 
 use super::{ColumnStatistics, Statistics};
 
-/// Convert logical type of partition column to physical type: `Dictionary(UInt16, val_type)`.
+/// Convert a type to one suitable for use as a [`ListingTable`]
+/// partition column. Returns `Dictionary(UInt16, val_type)`, which is
+/// a reasonable trade-off between supporting many distinct partition
+/// values and space efficiency.
 ///
-/// You CAN use this to specify types for partition columns. However you MAY also choose not to dictionary-encode the
-/// data or to use a different dictionary type.
+/// You can use this to specify types for partition columns. However,
+/// you may also choose not to dictionary-encode the data or to use a
+/// different dictionary type.
 ///
-/// Use [`wrap_partition_value_in_dict`] to wrap the values.
+/// Use [`wrap_partition_value_in_dict`] to wrap a [`ScalarValue`] in the same way.
 pub fn wrap_partition_type_in_dict(val_type: DataType) -> DataType {
     DataType::Dictionary(Box::new(DataType::UInt16), Box::new(val_type))
 }
 
-/// Convert scalar value of partition columns to physical type: `Dictionary(UInt16, val_type)` .
-///
-/// You CAN use this to specify types for partition columns. However you MAY also choose not to dictionary-encode the
-/// data or to use a different dictionary type.
-///
-/// Use [`wrap_partition_type_in_dict`] to wrap the types.
+/// Convert a partition column [`ScalarValue`] to a dictionary-encoded value, as
+/// described in the documentation of [`wrap_partition_type_in_dict`],
+/// which wraps the corresponding types.
 pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue {
     ScalarValue::Dictionary(Box::new(DataType::UInt16), Box::new(val))
 }
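As a quick sanity check of the workflow these docs describe, here is a minimal sketch (not part of the patch) that combines `with_table_partition_cols` with `wrap_partition_type_in_dict`. It assumes the module paths shown in this diff (`datafusion::datasource::listing`, `datafusion::physical_plan::file_format`) and the `datafusion::arrow` re-export; the `year`/`month` column names are illustrative.

```rust
use std::sync::Arc;

use datafusion::arrow::datatypes::DataType;
use datafusion::datasource::{file_format::parquet::ParquetFormat, listing::ListingOptions};
use datafusion::physical_plan::file_format::wrap_partition_type_in_dict;

fn main() {
    // Listing options for files laid out as
    // `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet`.
    // Each partition column is requested as Dictionary(UInt16, Utf8) because its
    // value repeats for every row read from a given file.
    let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default()))
        .with_table_partition_cols(vec![
            ("year".to_string(), wrap_partition_type_in_dict(DataType::Utf8)),
            ("month".to_string(), wrap_partition_type_in_dict(DataType::Utf8)),
        ]);

    // `table_partition_cols` is a public field in the version this diff touches.
    assert_eq!(listing_options.table_partition_cols.len(), 2);
}
```

Dictionary encoding keeps a small dictionary of the distinct `year`/`month` strings instead of materializing the value on every row, which is the trade-off the `wrap_partition_type_in_dict` docs refer to.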