Minor: Fix doc links and typos (#5225)
* Fix doc links and typos

* Format md

* Fix docs
Jefffrey authored Feb 10, 2023
1 parent 8f3c8c3 commit eda875b
Showing 24 changed files with 229 additions and 192 deletions.
22 changes: 11 additions & 11 deletions datafusion/common/src/config.rs
@@ -181,10 +181,10 @@ config_namespace! {
 config_namespace! {
     /// Options related to SQL parser
     pub struct SqlParserOptions {
-        /// When set to true, sql parser will parse float as decimal type
+        /// When set to true, SQL parser will parse float as decimal type
         pub parse_float_as_decimal: bool, default = false
 
-        /// When set to true, sql parser will normalize ident(convert ident to lowercase when not quoted)
+        /// When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
         pub enable_ident_normalization: bool, default = true
 
     }
@@ -194,7 +194,7 @@ config_namespace! {
     /// Options related to query execution
     pub struct ExecutionOptions {
         /// Default batch size while creating new batches, it's especially useful for
-        /// buffer-in-memory batches since creating tiny batches would results in too much
+        /// buffer-in-memory batches since creating tiny batches would result in too much
         /// metadata memory consumption
         pub batch_size: usize, default = 8192
 
@@ -208,12 +208,12 @@ config_namespace! {
         pub collect_statistics: bool, default = false
 
         /// Number of partitions for query execution. Increasing partitions can increase
-        /// concurrency. Defaults to the number of cpu cores on the system
+        /// concurrency. Defaults to the number of CPU cores on the system
         pub target_partitions: usize, default = num_cpus::get()
 
         /// The default time zone
         ///
-        /// Some functions, e.g. EXTRACT(HOUR from SOME_TIME), shift the underlying datetime
+        /// Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime
         /// according to this time zone, and then extract the hour
         pub time_zone: Option<String>, default = Some("+00:00".into())
 
@@ -240,7 +240,7 @@ config_namespace! {
         pub skip_metadata: bool, default = true
 
         /// If specified, the parquet reader will try and fetch the last `size_hint`
-        /// bytes of the parquet file optimistically. If not specified, two read are required:
+        /// bytes of the parquet file optimistically. If not specified, two reads are required:
         /// One read to fetch the 8-byte parquet footer and
         /// another to fetch the metadata length encoded in the footer
         pub metadata_size_hint: Option<usize>, default = None
@@ -260,7 +260,7 @@ config_namespace! {
     /// Options related to query optimization
     pub struct OptimizerOptions {
         /// When set to true, the physical plan optimizer will try to add round robin
-        /// repartition to increase parallelism to leverage more CPU cores
+        /// repartitioning to increase parallelism to leverage more CPU cores
         pub enable_round_robin_repartition: bool, default = true
 
         /// When set to true, the optimizer will insert filters before a join between
@@ -270,14 +270,14 @@ config_namespace! {
         pub filter_null_join_keys: bool, default = false
 
         /// Should DataFusion repartition data using the aggregate keys to execute aggregates
-        /// in parallel using the provided `target_partitions` level"
+        /// in parallel using the provided `target_partitions` level
        pub repartition_aggregations: bool, default = true
 
         /// Minimum total files size in bytes to perform file scan repartitioning.
         pub repartition_file_min_size: usize, default = 10 * 1024 * 1024
 
         /// Should DataFusion repartition data using the join keys to execute joins in parallel
-        /// using the provided `target_partitions` level"
+        /// using the provided `target_partitions` level
         pub repartition_joins: bool, default = true
 
         /// When set to true, file groups will be repartitioned to achieve maximum parallelism.
@@ -287,11 +287,11 @@ config_namespace! {
         pub repartition_file_scans: bool, default = false
 
         /// Should DataFusion repartition data using the partitions keys to execute window
-        /// functions in parallel using the provided `target_partitions` level"
+        /// functions in parallel using the provided `target_partitions` level
         pub repartition_windows: bool, default = true
 
         /// Should DataFusion execute sorts in a per-partition fashion and merge
-        /// afterwards instead of coalescing first and sorting globally
+        /// afterwards instead of coalescing first and sorting globally.
         /// With this flag is enabled, plans in the form below
         /// "SortExec: [a@0 ASC]",
         /// "  CoalescePartitionsExec",
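The options documented above are typically set through `SessionConfig` before building a context. A minimal sketch (the values are illustrative, not recommendations):

```rust
use datafusion::prelude::*;

// Build a session whose execution options mirror the settings documented above.
let config = SessionConfig::new()
    .with_target_partitions(8) // otherwise defaults to the number of CPU cores
    .with_batch_size(8192); // the documented default
let ctx = SessionContext::with_config(config);
```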
6 changes: 4 additions & 2 deletions datafusion/core/src/datasource/file_format/mod.rs
@@ -17,7 +17,7 @@
 
 //! Module containing helper methods for the various file formats
-/// default max records to scan to infer the schema
+/// Default max records to scan to infer the schema
 pub const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000;
 
 pub mod avro;
@@ -41,8 +41,10 @@ use async_trait::async_trait;
 use object_store::{ObjectMeta, ObjectStore};
 
 /// This trait abstracts all the file format specific implementations
-/// from the `TableProvider`. This helps code re-utilization across
+/// from the [`TableProvider`]. This helps code re-utilization across
 /// providers that support the the same file formats.
+///
+/// [`TableProvider`]: crate::datasource::datasource::TableProvider
 #[async_trait]
 pub trait FileFormat: Send + Sync + fmt::Debug {
     /// Returns the table provider as [`Any`](std::any::Any) so that it can be
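As a sketch of how a `FileFormat` implementation reaches a table provider: the listing provider accepts one via `ListingOptions` (assuming the listing API of this era; the extension value is illustrative):

```rust
use std::sync::Arc;

use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::datasource::listing::ListingOptions;

// The listing table provider delegates schema inference and scan planning
// to whichever `FileFormat` it is constructed with.
let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default()))
    .with_file_extension(".parquet");
```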
24 changes: 13 additions & 11 deletions datafusion/core/src/datasource/file_format/parquet.rs
@@ -61,39 +61,39 @@ pub const DEFAULT_PARQUET_EXTENSION: &str = ".parquet";
 /// <https://github.com/apache/arrow-datafusion/issues/4349>
 #[derive(Debug, Default)]
 pub struct ParquetFormat {
-    /// Override the global setting for enable_pruning
+    /// Override the global setting for `enable_pruning`
     enable_pruning: Option<bool>,
-    /// Override the global setting for metadata_size_hint
+    /// Override the global setting for `metadata_size_hint`
     metadata_size_hint: Option<usize>,
-    /// Override the global setting for skip_metadata
+    /// Override the global setting for `skip_metadata`
     skip_metadata: Option<bool>,
 }
 
 impl ParquetFormat {
-    /// construct a new Format with no local overrides
+    /// Construct a new Format with no local overrides
     pub fn new() -> Self {
         Self::default()
     }
 
     /// Activate statistics based row group level pruning
-    /// - If None, defaults to value on `config_options`
+    /// - If `None`, defaults to value on `config_options`
     pub fn with_enable_pruning(mut self, enable: Option<bool>) -> Self {
         self.enable_pruning = enable;
         self
     }
 
-    /// Return true if pruning is enabled
+    /// Return `true` if pruning is enabled
     pub fn enable_pruning(&self, config_options: &ConfigOptions) -> bool {
         self.enable_pruning
             .unwrap_or(config_options.execution.parquet.pruning)
     }
 
     /// Provide a hint to the size of the file metadata. If a hint is provided
     /// the reader will try and fetch the last `size_hint` bytes of the parquet file optimistically.
-    /// With out a hint, two read are required. One read to fetch the 8-byte parquet footer and then
+    /// Without a hint, two read are required. One read to fetch the 8-byte parquet footer and then
     /// another read to fetch the metadata length encoded in the footer.
     ///
-    /// - If None, defaults to value on `config_options`
+    /// - If `None`, defaults to value on `config_options`
     pub fn with_metadata_size_hint(mut self, size_hint: Option<usize>) -> Self {
         self.metadata_size_hint = size_hint;
         self
@@ -109,13 +109,13 @@ impl ParquetFormat {
     /// the file Schema. This can help avoid schema conflicts due to
     /// metadata.
     ///
-    /// - If None, defaults to value on `config_options`
+    /// - If `None`, defaults to value on `config_options`
     pub fn with_skip_metadata(mut self, skip_metadata: Option<bool>) -> Self {
         self.skip_metadata = skip_metadata;
         self
     }
 
-    /// returns true if schema metadata will be cleared prior to
+    /// Returns `true` if schema metadata will be cleared prior to
     /// schema merging.
     pub fn skip_metadata(&self, config_options: &ConfigOptions) -> bool {
         self.skip_metadata
@@ -375,7 +375,9 @@ fn summarize_min_max(
 /// Fetches parquet metadata from ObjectStore for given object
 ///
 /// This component is a subject to **change** in near future and is exposed for low level integrations
-/// through [ParquetFileReaderFactory].
+/// through [`ParquetFileReaderFactory`].
+///
+/// [`ParquetFileReaderFactory`]: crate::physical_plan::file_format::ParquetFileReaderFactory
 pub async fn fetch_parquet_metadata(
     store: &dyn ObjectStore,
     meta: &ObjectMeta,
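A short sketch of the builder methods documented in these hunks (the size hint is illustrative):

```rust
use datafusion::datasource::file_format::parquet::ParquetFormat;

// Each override is an `Option`: `Some(..)` takes precedence over the
// corresponding global `config_options` value, while `None` defers to it.
let format = ParquetFormat::new()
    .with_enable_pruning(Some(true))
    .with_metadata_size_hint(Some(64 * 1024)) // optimistically fetch the last 64 KiB
    .with_skip_metadata(Some(true));
```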
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/memory.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! In-memory data source for presenting a Vec<RecordBatch> as a data source that can be
+//! In-memory data source for presenting a `Vec<RecordBatch>` as a data source that can be
 //! queried by DataFusion. This allows data to be pre-loaded into memory and then
 //! repeatedly queried without incurring additional file I/O overhead.
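The data source this module doc describes is `MemTable`; a minimal sketch of wrapping a `Vec<RecordBatch>` (schema and values are made up):

```rust
use std::sync::Arc;

use datafusion::arrow::array::Int32Array;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::MemTable;
use datafusion::error::Result;

fn in_memory_table() -> Result<MemTable> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;
    // One partition containing a single pre-loaded batch.
    MemTable::try_new(schema, vec![vec![batch]])
}
```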
46 changes: 23 additions & 23 deletions datafusion/core/src/execution/context.rs
@@ -149,7 +149,7 @@ use super::options::{
 /// ```
 #[derive(Clone)]
 pub struct SessionContext {
-    /// Uuid for the session
+    /// UUID for the session
     session_id: String,
     /// Session start time
     session_start_time: DateTime<Utc>,
@@ -169,7 +169,7 @@ impl SessionContext {
         Self::with_config(SessionConfig::new())
     }
 
-    /// Finds any ListSchemaProviders and instructs them to reload tables from "disk"
+    /// Finds any [`ListingSchemaProvider`]s and instructs them to reload tables from "disk"
     pub async fn refresh_catalogs(&self) -> Result<()> {
         let cat_names = self.catalog_names().clone();
         for cat_name in cat_names.iter() {
@@ -195,7 +195,7 @@ impl SessionContext {
         Self::with_config_rt(config, runtime)
     }
 
-    /// Creates a new session context using the provided configuration and RuntimeEnv.
+    /// Creates a new session context using the provided configuration and [`RuntimeEnv`].
     pub fn with_config_rt(config: SessionConfig, runtime: Arc<RuntimeEnv>) -> Self {
         let state = SessionState::with_config_rt(config, runtime);
         Self::with_state(state)
@@ -235,12 +235,12 @@ impl SessionContext {
         self.state.read().runtime_env.clone()
     }
 
-    /// Return the session_id of this Session
+    /// Return the `session_id` of this Session
     pub fn session_id(&self) -> String {
         self.session_id.clone()
     }
 
-    /// Return the enable_ident_normalization of this Session
+    /// Return the `enable_ident_normalization` of this Session
     pub fn enable_ident_normalization(&self) -> bool {
         self.state
             .read()
@@ -257,7 +257,7 @@ impl SessionContext {
 
     /// Creates a [`DataFrame`] that will execute a SQL query.
     ///
-    /// Note: This api implements DDL such as `CREATE TABLE` and `CREATE VIEW` with in memory
+    /// Note: This API implements DDL such as `CREATE TABLE` and `CREATE VIEW` with in-memory
     /// default implementations.
     ///
     /// If this is not desirable, consider using [`SessionState::create_logical_plan()`] which
@@ -661,7 +661,7 @@ impl SessionContext {
         self._read_type(table_path, options).await
     }
 
-    /// Creates a [`DataFrame`] for reading an Json data source.
+    /// Creates a [`DataFrame`] for reading an JSON data source.
     ///
     /// For more control such as reading multiple files, you can use
     /// [`read_table`](Self::read_table) with a [`ListingTable`].
@@ -788,7 +788,7 @@ impl SessionContext {
         Ok(())
     }
 
-    /// Registers a Json file as a table that it can be referenced
+    /// Registers a JSON file as a table that it can be referenced
     /// from SQL statements executed against this context.
     pub async fn register_json(
         &self,
@@ -906,7 +906,7 @@ impl SessionContext {
             .deregister_table(&table)
     }
 
-    /// Return true if the specified table exists in the schema provider.
+    /// Return `true` if the specified table exists in the schema provider.
     pub fn table_exist<'a>(
         &'a self,
         table_ref: impl Into<TableReference<'a>>,
@@ -1105,8 +1105,8 @@ type AnyMap =
 
 /// Hasher for [`AnyMap`].
 ///
-/// With [`TypeId`}s as keys, there's no need to hash them. They are already hashes themselves, coming from the compiler.
-/// The [`IdHasher`} just holds the [`u64`} of the [`TypeId`}, and then returns it, instead of doing any bit fiddling.
+/// With [`TypeId`]s as keys, there's no need to hash them. They are already hashes themselves, coming from the compiler.
+/// The [`IdHasher`] just holds the [`u64`] of the [`TypeId`], and then returns it, instead of doing any bit fiddling.
 #[derive(Default)]
 struct IdHasher(u64);
 
@@ -1194,15 +1194,19 @@ impl SessionConfig {
         self
     }
 
-    /// Customize [`OPT_TARGET_PARTITIONS`]
+    /// Customize [`target_partitions`]
+    ///
+    /// [`target_partitions`]: crate::config::ExecutionOptions::target_partitions
     pub fn with_target_partitions(mut self, n: usize) -> Self {
         // partition count must be greater than zero
         assert!(n > 0);
         self.options.execution.target_partitions = n;
         self
     }
 
-    /// get target_partitions
+    /// Get [`target_partitions`]
+    ///
+    /// [`target_partitions`]: crate::config::ExecutionOptions::target_partitions
     pub fn target_partitions(&self) -> usize {
         self.options.execution.target_partitions
     }
@@ -1330,7 +1334,7 @@ impl SessionConfig {
     /// Note that this method will eventually be deprecated and
     /// replaced by [`config_options`].
    ///
-    /// [`config_options`]: SessionContext::config_option
+    /// [`config_options`]: Self::config_options
     pub fn to_props(&self) -> HashMap<String, String> {
         let mut map = HashMap::new();
         // copy configs from config_options
@@ -1342,15 +1346,11 @@ impl SessionConfig {
     }
 
     /// Return a handle to the configuration options.
-    ///
-    /// [`config_options`]: SessionContext::config_option
     pub fn config_options(&self) -> &ConfigOptions {
         &self.options
     }
 
     /// Return a mutable handle to the configuration options.
-    ///
-    /// [`config_options`]: SessionContext::config_option
     pub fn config_options_mut(&mut self) -> &mut ConfigOptions {
         &mut self.options
     }
@@ -1436,7 +1436,7 @@ impl From<ConfigOptions> for SessionConfig {
 /// Execution context for registering data sources and executing queries
 #[derive(Clone)]
 pub struct SessionState {
-    /// Uuid for the session
+    /// UUID for the session
     session_id: String,
     /// Responsible for optimizing a logical plan
     optimizer: Optimizer,
@@ -1691,7 +1691,7 @@ impl SessionState {
         self
     }
 
-    /// Convert a sql string into an ast Statement
+    /// Convert a SQL string into an AST Statement
     pub fn sql_to_statement(
         &self,
         sql: &str,
@@ -1710,7 +1710,7 @@ impl SessionState {
         Ok(statement)
     }
 
-    /// Convert an ast Statement into a LogicalPlan
+    /// Convert an AST Statement into a LogicalPlan
     pub async fn statement_to_plan(
         &self,
         statement: datafusion_sql::parser::Statement,
@@ -2035,12 +2035,12 @@ impl TaskContext {
         &self.session_config
     }
 
-    /// Return the session_id of this [TaskContext]
+    /// Return the `session_id` of this [TaskContext]
     pub fn session_id(&self) -> String {
         self.session_id.clone()
     }
 
-    /// Return the task_id of this [TaskContext]
+    /// Return the `task_id` of this [TaskContext]
     pub fn task_id(&self) -> Option<String> {
         self.task_id.clone()
    }
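A usage sketch tying the `SessionContext` methods touched above together (the table name and file path are placeholders):

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

async fn query_json() -> Result<()> {
    let ctx = SessionContext::new();
    // Register a newline-delimited JSON file under a table name.
    ctx.register_json("events", "data.json", NdJsonReadOptions::default())
        .await?;
    if ctx.table_exist("events")? {
        // `sql` parses the statement, plans it, and returns a lazy DataFrame.
        let df = ctx.sql("SELECT COUNT(*) FROM events").await?;
        df.show().await?;
    }
    Ok(())
}
```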