Minor: Fix doc links and typos (#5225)
* Fix doc links and typos

* Format md

* Fix docs
Jefffrey authored Feb 10, 2023
1 parent 8f3c8c3 commit eda875b
Showing 24 changed files with 229 additions and 192 deletions.
22 changes: 11 additions & 11 deletions datafusion/common/src/config.rs
@@ -181,10 +181,10 @@ config_namespace! {
 config_namespace! {
     /// Options related to SQL parser
     pub struct SqlParserOptions {
-        /// When set to true, sql parser will parse float as decimal type
+        /// When set to true, SQL parser will parse float as decimal type
         pub parse_float_as_decimal: bool, default = false
 
-        /// When set to true, sql parser will normalize ident(convert ident to lowercase when not quoted)
+        /// When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
         pub enable_ident_normalization: bool, default = true
 
     }
@@ -194,7 +194,7 @@ config_namespace! {
     /// Options related to query execution
     pub struct ExecutionOptions {
         /// Default batch size while creating new batches, it's especially useful for
-        /// buffer-in-memory batches since creating tiny batches would results in too much
+        /// buffer-in-memory batches since creating tiny batches would result in too much
         /// metadata memory consumption
         pub batch_size: usize, default = 8192
 
@@ -208,12 +208,12 @@ config_namespace! {
         pub collect_statistics: bool, default = false
 
         /// Number of partitions for query execution. Increasing partitions can increase
-        /// concurrency. Defaults to the number of cpu cores on the system
+        /// concurrency. Defaults to the number of CPU cores on the system
         pub target_partitions: usize, default = num_cpus::get()
 
         /// The default time zone
         ///
-        /// Some functions, e.g. EXTRACT(HOUR from SOME_TIME), shift the underlying datetime
+        /// Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime
         /// according to this time zone, and then extract the hour
         pub time_zone: Option<String>, default = Some("+00:00".into())
 
@@ -240,7 +240,7 @@ config_namespace! {
         pub skip_metadata: bool, default = true
 
         /// If specified, the parquet reader will try and fetch the last `size_hint`
-        /// bytes of the parquet file optimistically. If not specified, two read are required:
+        /// bytes of the parquet file optimistically. If not specified, two reads are required:
         /// One read to fetch the 8-byte parquet footer and
         /// another to fetch the metadata length encoded in the footer
         pub metadata_size_hint: Option<usize>, default = None
@@ -260,7 +260,7 @@ config_namespace! {
     /// Options related to query optimization
     pub struct OptimizerOptions {
         /// When set to true, the physical plan optimizer will try to add round robin
-        /// repartition to increase parallelism to leverage more CPU cores
+        /// repartitioning to increase parallelism to leverage more CPU cores
         pub enable_round_robin_repartition: bool, default = true
 
         /// When set to true, the optimizer will insert filters before a join between
@@ -270,14 +270,14 @@ config_namespace! {
         pub filter_null_join_keys: bool, default = false
 
         /// Should DataFusion repartition data using the aggregate keys to execute aggregates
-        /// in parallel using the provided `target_partitions` level"
+        /// in parallel using the provided `target_partitions` level
        pub repartition_aggregations: bool, default = true
 
         /// Minimum total files size in bytes to perform file scan repartitioning.
         pub repartition_file_min_size: usize, default = 10 * 1024 * 1024
 
         /// Should DataFusion repartition data using the join keys to execute joins in parallel
-        /// using the provided `target_partitions` level"
+        /// using the provided `target_partitions` level
         pub repartition_joins: bool, default = true
 
         /// When set to true, file groups will be repartitioned to achieve maximum parallelism.
@@ -287,11 +287,11 @@ config_namespace! {
         pub repartition_file_scans: bool, default = false
 
         /// Should DataFusion repartition data using the partitions keys to execute window
-        /// functions in parallel using the provided `target_partitions` level"
+        /// functions in parallel using the provided `target_partitions` level
         pub repartition_windows: bool, default = true
 
         /// Should DataFusion execute sorts in a per-partition fashion and merge
-        /// afterwards instead of coalescing first and sorting globally
+        /// afterwards instead of coalescing first and sorting globally.
         /// With this flag is enabled, plans in the form below
         /// "SortExec: [a@0 ASC]",
         /// "  CoalescePartitionsExec",
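The options documented above are typically set through `SessionConfig` before building a context. A minimal sketch (the values are illustrative, not recommendations):

```rust
use datafusion::prelude::*;

// Build a session whose execution options mirror the settings documented above.
let config = SessionConfig::new()
    .with_target_partitions(8) // otherwise defaults to the number of CPU cores
    .with_batch_size(8192); // the documented default
let ctx = SessionContext::with_config(config);
```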
6 changes: 4 additions & 2 deletions datafusion/core/src/datasource/file_format/mod.rs
@@ -17,7 +17,7 @@
 
 //! Module containing helper methods for the various file formats
-/// default max records to scan to infer the schema
+/// Default max records to scan to infer the schema
 pub const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000;
 
 pub mod avro;
@@ -41,8 +41,10 @@ use async_trait::async_trait;
 use object_store::{ObjectMeta, ObjectStore};
 
 /// This trait abstracts all the file format specific implementations
-/// from the `TableProvider`. This helps code re-utilization across
+/// from the [`TableProvider`]. This helps code re-utilization across
 /// providers that support the the same file formats.
+///
+/// [`TableProvider`]: crate::datasource::datasource::TableProvider
 #[async_trait]
 pub trait FileFormat: Send + Sync + fmt::Debug {
     /// Returns the table provider as [`Any`](std::any::Any) so that it can be
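As a sketch of how a `FileFormat` implementation reaches a table provider: the listing provider accepts one via `ListingOptions` (assuming the listing API of this era; the extension value is illustrative):

```rust
use std::sync::Arc;

use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::datasource::listing::ListingOptions;

// The listing table provider delegates schema inference and scan planning
// to whichever `FileFormat` it is constructed with.
let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default()))
    .with_file_extension(".parquet");
```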
24 changes: 13 additions & 11 deletions datafusion/core/src/datasource/file_format/parquet.rs
@@ -61,39 +61,39 @@ pub const DEFAULT_PARQUET_EXTENSION: &str = ".parquet";
 /// <https://github.com/apache/arrow-datafusion/issues/4349>
 #[derive(Debug, Default)]
 pub struct ParquetFormat {
-    /// Override the global setting for enable_pruning
+    /// Override the global setting for `enable_pruning`
     enable_pruning: Option<bool>,
-    /// Override the global setting for metadata_size_hint
+    /// Override the global setting for `metadata_size_hint`
     metadata_size_hint: Option<usize>,
-    /// Override the global setting for skip_metadata
+    /// Override the global setting for `skip_metadata`
     skip_metadata: Option<bool>,
 }
 
 impl ParquetFormat {
-    /// construct a new Format with no local overrides
+    /// Construct a new Format with no local overrides
     pub fn new() -> Self {
         Self::default()
     }
 
     /// Activate statistics based row group level pruning
-    /// - If None, defaults to value on `config_options`
+    /// - If `None`, defaults to value on `config_options`
     pub fn with_enable_pruning(mut self, enable: Option<bool>) -> Self {
         self.enable_pruning = enable;
         self
     }
 
-    /// Return true if pruning is enabled
+    /// Return `true` if pruning is enabled
     pub fn enable_pruning(&self, config_options: &ConfigOptions) -> bool {
         self.enable_pruning
             .unwrap_or(config_options.execution.parquet.pruning)
     }
 
     /// Provide a hint to the size of the file metadata. If a hint is provided
     /// the reader will try and fetch the last `size_hint` bytes of the parquet file optimistically.
-    /// With out a hint, two read are required. One read to fetch the 8-byte parquet footer and then
+    /// Without a hint, two read are required. One read to fetch the 8-byte parquet footer and then
     /// another read to fetch the metadata length encoded in the footer.
     ///
-    /// - If None, defaults to value on `config_options`
+    /// - If `None`, defaults to value on `config_options`
     pub fn with_metadata_size_hint(mut self, size_hint: Option<usize>) -> Self {
         self.metadata_size_hint = size_hint;
         self
@@ -109,13 +109,13 @@ impl ParquetFormat {
     /// the file Schema. This can help avoid schema conflicts due to
     /// metadata.
     ///
-    /// - If None, defaults to value on `config_options`
+    /// - If `None`, defaults to value on `config_options`
     pub fn with_skip_metadata(mut self, skip_metadata: Option<bool>) -> Self {
         self.skip_metadata = skip_metadata;
         self
     }
 
-    /// returns true if schema metadata will be cleared prior to
+    /// Returns `true` if schema metadata will be cleared prior to
     /// schema merging.
     pub fn skip_metadata(&self, config_options: &ConfigOptions) -> bool {
         self.skip_metadata
@@ -375,7 +375,9 @@ fn summarize_min_max(
 /// Fetches parquet metadata from ObjectStore for given object
 ///
 /// This component is a subject to **change** in near future and is exposed for low level integrations
-/// through [ParquetFileReaderFactory].
+/// through [`ParquetFileReaderFactory`].
+///
+/// [`ParquetFileReaderFactory`]: crate::physical_plan::file_format::ParquetFileReaderFactory
 pub async fn fetch_parquet_metadata(
     store: &dyn ObjectStore,
     meta: &ObjectMeta,
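A short sketch of the builder methods documented in these hunks (the size hint is illustrative):

```rust
use datafusion::datasource::file_format::parquet::ParquetFormat;

// Each override is an `Option`: `Some(..)` takes precedence over the
// corresponding global `config_options` value, while `None` defers to it.
let format = ParquetFormat::new()
    .with_enable_pruning(Some(true))
    .with_metadata_size_hint(Some(64 * 1024)) // optimistically fetch the last 64 KiB
    .with_skip_metadata(Some(true));
```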
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/memory.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! In-memory data source for presenting a Vec<RecordBatch> as a data source that can be
+//! In-memory data source for presenting a `Vec<RecordBatch>` as a data source that can be
 //! queried by DataFusion. This allows data to be pre-loaded into memory and then
 //! repeatedly queried without incurring additional file I/O overhead.
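The data source this module doc describes is `MemTable`; a minimal sketch of wrapping a `Vec<RecordBatch>` (schema and values are made up):

```rust
use std::sync::Arc;

use datafusion::arrow::array::Int32Array;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::MemTable;
use datafusion::error::Result;

fn in_memory_table() -> Result<MemTable> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;
    // One partition containing a single pre-loaded batch.
    MemTable::try_new(schema, vec![vec![batch]])
}
```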
46 changes: 23 additions & 23 deletions datafusion/core/src/execution/context.rs
@@ -149,7 +149,7 @@ use super::options::{
 /// ```
 #[derive(Clone)]
 pub struct SessionContext {
-    /// Uuid for the session
+    /// UUID for the session
     session_id: String,
     /// Session start time
     session_start_time: DateTime<Utc>,
@@ -169,7 +169,7 @@ impl SessionContext {
         Self::with_config(SessionConfig::new())
     }
 
-    /// Finds any ListSchemaProviders and instructs them to reload tables from "disk"
+    /// Finds any [`ListingSchemaProvider`]s and instructs them to reload tables from "disk"
     pub async fn refresh_catalogs(&self) -> Result<()> {
         let cat_names = self.catalog_names().clone();
         for cat_name in cat_names.iter() {
@@ -195,7 +195,7 @@ impl SessionContext {
         Self::with_config_rt(config, runtime)
     }
 
-    /// Creates a new session context using the provided configuration and RuntimeEnv.
+    /// Creates a new session context using the provided configuration and [`RuntimeEnv`].
     pub fn with_config_rt(config: SessionConfig, runtime: Arc<RuntimeEnv>) -> Self {
         let state = SessionState::with_config_rt(config, runtime);
         Self::with_state(state)
@@ -235,12 +235,12 @@ impl SessionContext {
         self.state.read().runtime_env.clone()
     }
 
-    /// Return the session_id of this Session
+    /// Return the `session_id` of this Session
     pub fn session_id(&self) -> String {
         self.session_id.clone()
     }
 
-    /// Return the enable_ident_normalization of this Session
+    /// Return the `enable_ident_normalization` of this Session
     pub fn enable_ident_normalization(&self) -> bool {
         self.state
             .read()
@@ -257,7 +257,7 @@ impl SessionContext {
 
     /// Creates a [`DataFrame`] that will execute a SQL query.
     ///
-    /// Note: This api implements DDL such as `CREATE TABLE` and `CREATE VIEW` with in memory
+    /// Note: This API implements DDL such as `CREATE TABLE` and `CREATE VIEW` with in-memory
     /// default implementations.
     ///
     /// If this is not desirable, consider using [`SessionState::create_logical_plan()`] which
@@ -661,7 +661,7 @@ impl SessionContext {
         self._read_type(table_path, options).await
     }
 
-    /// Creates a [`DataFrame`] for reading an Json data source.
+    /// Creates a [`DataFrame`] for reading an JSON data source.
     ///
     /// For more control such as reading multiple files, you can use
     /// [`read_table`](Self::read_table) with a [`ListingTable`].
@@ -788,7 +788,7 @@ impl SessionContext {
         Ok(())
     }
 
-    /// Registers a Json file as a table that it can be referenced
+    /// Registers a JSON file as a table that it can be referenced
     /// from SQL statements executed against this context.
     pub async fn register_json(
         &self,
@@ -906,7 +906,7 @@ impl SessionContext {
             .deregister_table(&table)
     }
 
-    /// Return true if the specified table exists in the schema provider.
+    /// Return `true` if the specified table exists in the schema provider.
     pub fn table_exist<'a>(
         &'a self,
         table_ref: impl Into<TableReference<'a>>,
@@ -1105,8 +1105,8 @@ type AnyMap =
 
 /// Hasher for [`AnyMap`].
 ///
-/// With [`TypeId`}s as keys, there's no need to hash them. They are already hashes themselves, coming from the compiler.
-/// The [`IdHasher`} just holds the [`u64`} of the [`TypeId`}, and then returns it, instead of doing any bit fiddling.
+/// With [`TypeId`]s as keys, there's no need to hash them. They are already hashes themselves, coming from the compiler.
+/// The [`IdHasher`] just holds the [`u64`] of the [`TypeId`], and then returns it, instead of doing any bit fiddling.
 #[derive(Default)]
 struct IdHasher(u64);
 
@@ -1194,15 +1194,19 @@ impl SessionConfig {
         self
     }
 
-    /// Customize [`OPT_TARGET_PARTITIONS`]
+    /// Customize [`target_partitions`]
+    ///
+    /// [`target_partitions`]: crate::config::ExecutionOptions::target_partitions
     pub fn with_target_partitions(mut self, n: usize) -> Self {
         // partition count must be greater than zero
         assert!(n > 0);
         self.options.execution.target_partitions = n;
         self
     }
 
-    /// get target_partitions
+    /// Get [`target_partitions`]
+    ///
+    /// [`target_partitions`]: crate::config::ExecutionOptions::target_partitions
     pub fn target_partitions(&self) -> usize {
         self.options.execution.target_partitions
     }
@@ -1330,7 +1334,7 @@ impl SessionConfig {
     /// Note that this method will eventually be deprecated and
     /// replaced by [`config_options`].
    ///
-    /// [`config_options`]: SessionContext::config_option
+    /// [`config_options`]: Self::config_options
     pub fn to_props(&self) -> HashMap<String, String> {
         let mut map = HashMap::new();
         // copy configs from config_options
@@ -1342,15 +1346,11 @@ impl SessionConfig {
     }
 
     /// Return a handle to the configuration options.
-    ///
-    /// [`config_options`]: SessionContext::config_option
     pub fn config_options(&self) -> &ConfigOptions {
         &self.options
     }
 
     /// Return a mutable handle to the configuration options.
-    ///
-    /// [`config_options`]: SessionContext::config_option
     pub fn config_options_mut(&mut self) -> &mut ConfigOptions {
         &mut self.options
     }
@@ -1436,7 +1436,7 @@ impl From<ConfigOptions> for SessionConfig {
 /// Execution context for registering data sources and executing queries
 #[derive(Clone)]
 pub struct SessionState {
-    /// Uuid for the session
+    /// UUID for the session
     session_id: String,
     /// Responsible for optimizing a logical plan
     optimizer: Optimizer,
@@ -1691,7 +1691,7 @@ impl SessionState {
         self
     }
 
-    /// Convert a sql string into an ast Statement
+    /// Convert a SQL string into an AST Statement
     pub fn sql_to_statement(
         &self,
         sql: &str,
@@ -1710,7 +1710,7 @@ impl SessionState {
         Ok(statement)
     }
 
-    /// Convert an ast Statement into a LogicalPlan
+    /// Convert an AST Statement into a LogicalPlan
     pub async fn statement_to_plan(
         &self,
         statement: datafusion_sql::parser::Statement,
@@ -2035,12 +2035,12 @@ impl TaskContext {
         &self.session_config
     }
 
-    /// Return the session_id of this [TaskContext]
+    /// Return the `session_id` of this [TaskContext]
     pub fn session_id(&self) -> String {
         self.session_id.clone()
     }
 
-    /// Return the task_id of this [TaskContext]
+    /// Return the `task_id` of this [TaskContext]
     pub fn task_id(&self) -> Option<String> {
         self.task_id.clone()
    }
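A usage sketch tying the `SessionContext` methods touched above together (the table name and file path are placeholders):

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

async fn query_json() -> Result<()> {
    let ctx = SessionContext::new();
    // Register a newline-delimited JSON file under a table name.
    ctx.register_json("events", "data.json", NdJsonReadOptions::default())
        .await?;
    if ctx.table_exist("events")? {
        // `sql` parses the statement, plans it, and returns a lazy DataFrame.
        let df = ctx.sql("SELECT COUNT(*) FROM events").await?;
        df.show().await?;
    }
    Ok(())
}
```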