From 8aee829127c149ec298197d91922bb59af8d1294 Mon Sep 17 00:00:00 2001 From: Clay McLeod Date: Tue, 6 Aug 2024 17:07:23 -0500 Subject: [PATCH] revise: adds extension trait and AST registry --- Cargo.toml | 1 + wdl-ast/src/lib.rs | 2 + wdl-ast/src/registry.rs | 241 +++++++++++++++++++++++++++++++ wdl-ast/src/v1/expr.rs | 2 +- wdl-config/src/loader.rs | 127 ++++++++++++++++ wdl-grammar/Cargo.toml | 2 + wdl-grammar/src/tree.rs | 304 ++++++++++++++++++++++++++++++++++++++- 7 files changed, 675 insertions(+), 4 deletions(-) create mode 100644 wdl-ast/src/registry.rs create mode 100644 wdl-config/src/loader.rs diff --git a/Cargo.toml b/Cargo.toml index afc190fca..4874253e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,3 +49,4 @@ futures = "0.3.30" glob = "0.3.1" path-clean = "1.0.1" indicatif = "0.17.8" +itertools = "0.13.0" diff --git a/wdl-ast/src/lib.rs b/wdl-ast/src/lib.rs index 038dc20e5..317e3d2fc 100644 --- a/wdl-ast/src/lib.rs +++ b/wdl-ast/src/lib.rs @@ -58,6 +58,8 @@ pub use wdl_grammar::WorkflowDescriptionLanguage; pub mod v1; +#[cfg(test)] +mod registry; mod validation; mod visitor; diff --git a/wdl-ast/src/registry.rs b/wdl-ast/src/registry.rs new file mode 100644 index 000000000..f0d98a63b --- /dev/null +++ b/wdl-ast/src/registry.rs @@ -0,0 +1,241 @@ +use std::any::TypeId; +use std::collections::HashMap; +use std::sync::LazyLock; + +use wdl_grammar::WorkflowDescriptionLanguage; +use wdl_grammar::ALL_SYNTAX_KIND; + +use crate::v1; +use crate::AstNode; +use crate::AstToken; +use crate::Comment; +use crate::Document; +use crate::Ident; +use crate::SyntaxKind; +use crate::Version; +use crate::VersionStatement; +use crate::Whitespace; + +/// A private module for sealed traits. +mod private { + /// The sealed trait for [`AstNodeRegistrant`](super::AstNodeRegistrant). + pub trait SealedNode {} + + /// The sealed trait for [`AstTokenRegistrant`](super::AstTokenRegistrant). 
+ pub trait SealedToken {} +} + +/// A registry of all known mappings between types that implement [`AstNode`] +/// and the [`SyntaxKind`] they can map to. +static REGISTRY: LazyLock>> = LazyLock::new(|| { + let types = vec![ + Comment::register(), + Document::register(), + Ident::register(), + v1::AccessExpr::register(), + v1::AdditionExpr::register(), + v1::ArrayType::register(), + v1::Ast::register(), + v1::BoundDecl::register(), + v1::CallAfter::register(), + v1::CallAlias::register(), + v1::CallExpr::register(), + v1::CallInputItem::register(), + v1::CallTarget::register(), + v1::CommandSection::register(), + v1::CommandText::register(), + v1::ConditionalStatement::register(), + v1::Decl::register(), + v1::DefaultOption::register(), + v1::DivisionExpr::register(), + v1::DocumentItem::register(), + v1::EqualityExpr::register(), + v1::ExponentiationExpr::register(), + v1::Expr::register(), + v1::Float::register(), + v1::GreaterEqualExpr::register(), + v1::GreaterExpr::register(), + v1::HintsItem::register(), + v1::HintsSection::register(), + v1::IfExpr::register(), + v1::ImportAlias::register(), + v1::ImportStatement::register(), + v1::IndexExpr::register(), + v1::InequalityExpr::register(), + v1::InputSection::register(), + v1::Integer::register(), + v1::LessEqualExpr::register(), + v1::LessExpr::register(), + v1::LiteralArray::register(), + v1::LiteralBoolean::register(), + v1::LiteralExpr::register(), + v1::LiteralFloat::register(), + v1::LiteralHints::register(), + v1::LiteralHintsItem::register(), + v1::LiteralInput::register(), + v1::LiteralInputItem::register(), + v1::LiteralInteger::register(), + v1::LiteralMap::register(), + v1::LiteralMapItem::register(), + v1::LiteralNone::register(), + v1::LiteralObject::register(), + v1::LiteralObjectItem::register(), + v1::LiteralOutput::register(), + v1::LiteralOutputItem::register(), + v1::LiteralPair::register(), + v1::LiteralString::register(), + v1::LiteralStruct::register(), + 
v1::LiteralStructItem::register(), + v1::LogicalAndExpr::register(), + v1::LogicalNotExpr::register(), + v1::LogicalOrExpr::register(), + v1::MapType::register(), + v1::MetadataArray::register(), + v1::MetadataObjectItem::register(), + v1::MetadataSection::register(), + v1::ModuloExpr::register(), + v1::MultiplicationExpr::register(), + v1::NameRef::register(), + v1::NegationExpr::register(), + v1::ObjectType::register(), + v1::OutputSection::register(), + v1::PairType::register(), + v1::ParameterMetadataSection::register(), + v1::ParenthesizedExpr::register(), + v1::Placeholder::register(), + v1::PlaceholderOption::register(), + v1::PrimitiveType::register(), + v1::RequirementsItem::register(), + v1::RequirementsSection::register(), + v1::RuntimeItem::register(), + v1::RuntimeSection::register(), + v1::ScatterStatement::register(), + v1::SectionParent::register(), + v1::SepOption::register(), + v1::StringText::register(), + v1::StructDefinition::register(), + v1::StructItem::register(), + v1::SubtractionExpr::register(), + v1::TaskDefinition::register(), + v1::TaskItem::register(), + v1::TrueFalseOption::register(), + v1::Type::register(), + v1::TypeRef::register(), + v1::UnboundDecl::register(), + v1::WorkflowDefinition::register(), + v1::WorkflowItem::register(), + v1::WorkflowStatement::register(), + Version::register(), + VersionStatement::register(), + Whitespace::register(), + ]; + + let mut result = HashMap::new(); + + // NOTE: this is done this way instead of collecting to check on the fly to + // make sure that no keys are duplicated. + for (r#type, kinds) in types { + if result.contains_key(&r#type) { + panic!("the `{:?}` key is duplicated", r#type); + } + + result.insert(r#type, kinds); + } + + result +}); + +/// Computes the inverse of the registry (maps [`SyntaxKind`]s to every type +/// that can cast from them). 
+ +fn inverse_registry() -> HashMap> { + let mut result = HashMap::>::new(); + + for (key, values) in REGISTRY.iter() { + for value in values.into_iter() { + result.entry(value.to_owned()).or_default().push(*key); + } + } + + result + .into_iter() + .map(|(key, values)| (key, values.into_boxed_slice())) + .collect() +} + +trait AstNodeRegistrant: private::SealedNode { + /// Registers the AST element, returning its [`TypeId`] paired with every + /// [`SyntaxKind`] for which the type's `can_cast` returns `true`. + fn register() -> (TypeId, Box<[SyntaxKind]>); +} + +impl + 'static> private::SealedNode for T {} + +impl + 'static> AstNodeRegistrant for T { + fn register() -> (TypeId, Box<[SyntaxKind]>) { + ( + TypeId::of::(), + ALL_SYNTAX_KIND + .iter() + .filter(|kind| T::can_cast(**kind)) + .cloned() + .collect::>() + .into_boxed_slice(), + ) + } +} + +trait AstTokenRegistrant: private::SealedToken { + /// Registers a type implementing `AstToken`, returning its [`TypeId`] paired + /// with every [`SyntaxKind`] for which the type's `can_cast` returns `true`. + fn register() -> (TypeId, Box<[SyntaxKind]>); +} + +impl private::SealedToken for T {} + +impl AstTokenRegistrant for T { + fn register() -> (TypeId, Box<[SyntaxKind]>) { + ( + TypeId::of::(), + ALL_SYNTAX_KIND + .iter() + .filter(|kind| T::can_cast(**kind)) + .cloned() + .collect::>() + .into_boxed_slice(), + ) + } +} + +mod tests { + use super::*; + + #[test] + fn ensure_each_syntax_element_has_an_ast_node() { + let mut missing = Vec::new(); + let inverse_registry = inverse_registry(); + + for kind in ALL_SYNTAX_KIND { + // NOTE: these are pseudo elements and should not be reported. 
+ if *kind == SyntaxKind::Abandoned || *kind == SyntaxKind::MAX { + continue; + } + + if !inverse_registry.contains_key(kind) { + missing.push(kind); + } + } + + if !missing.is_empty() { + let mut missing = missing + .into_iter() + .map(|kind| format!("{:?}", kind)) + .collect::>(); + missing.sort(); + + panic!( + "detected `SyntaxKind`s without an associated `AstNode` (n={}): {}", + missing.len(), + missing.join(", ") + ) + } + } +} diff --git a/wdl-ast/src/v1/expr.rs b/wdl-ast/src/v1/expr.rs index cca86f97d..d596274ac 100644 --- a/wdl-ast/src/v1/expr.rs +++ b/wdl-ast/src/v1/expr.rs @@ -3698,7 +3698,7 @@ task test { r#" version 1.1 -task test { +task test { Foo a = Foo { foo: "bar" } Bar b = Bar { bar: 1, baz: [1, 2, 3] } } diff --git a/wdl-config/src/loader.rs b/wdl-config/src/loader.rs new file mode 100644 index 000000000..6d7e605e2 --- /dev/null +++ b/wdl-config/src/loader.rs @@ -0,0 +1,127 @@ +use std::collections::VecDeque; +use std::convert::Infallible; +use std::path::PathBuf; + +use config::ConfigError; +use config::Environment; +use config::File; + +use crate::providers::EnvProvider; +use crate::providers::FileProvider; +use crate::BoxedProvider; +use crate::Config; +use crate::Provider; +use crate::CONFIG_SEARCH_PATHS; + +#[derive(Debug)] +pub enum Error { + /// An error from the `config` crate. + Config(ConfigError), +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::Config(err) => write!(f, "`config` error: {err}"), + } + } +} + +impl std::error::Error for Error {} + +/// A [`Result`](std::result::Result) with an [`Error`]. +pub type Result = std::result::Result; + +pub struct Loader(VecDeque); + +impl Loader { + /// Creates an empty [`Loader`]. + pub fn empty() -> Self { + Self(VecDeque::new()) + } + + /// Adds the default configuration to the front of the provider stack. 
+ pub fn with_default_configuration(mut self) -> Self { + // NOTE: default configuration should always be the first provider evaluated. + self.0.push_front(Config::default().into()); + self + } + + /// Adds a file to the search path of the [`Loader`]. + /// + /// Note that the file is not required to be present. + pub fn add_optional_file(mut self, path: PathBuf) -> Self { + self.0.push_back(FileProvider::optional(path).into()); + self + } + + /// Adds a file to the search path of the [`Loader`]. + /// + /// Note that the file is required to be present. + pub fn add_required_file(mut self, path: PathBuf) -> Self { + self.0.push_back(FileProvider::required(path).into()); + self + } + + /// Adds the default search paths to the [`Loader`]. + pub fn with_default_search_paths(mut self) -> Self { + for path in CONFIG_SEARCH_PATHS.clone().into_iter() { + self = self.add_optional_file(path); + } + + self + } + + /// Adds a new environment prefix to the [`Loader`]. + pub fn add_env_prefix(mut self, prefix: &str) -> Self { + self.0.push_back(EnvProvider::new(prefix).into()); + self + } + + /// Adds the default environment prefix to the [`Loader`]. + pub fn with_default_env_prefix(mut self) -> Self { + self.0.push_back(EnvProvider::default().into()); + self + } + + /// Gets a reference to the inner queue of providers. + pub fn inner(&self) -> &VecDeque { + &self.0 + } + + /// Consumes `self` and returns the inner queue of providers. + pub fn into_inner(self) -> VecDeque { + self.0 + } + + /// Consumes `self` and attempts to load the [`Config`]. + pub fn try_load(self) -> std::result::Result> { + // NOTE(review): this body looks unfinished: the `map_err` closure below + // is empty, and `self.0` is consumed by the `for` loop before `.build()` + // is called on it. + for provider in self.0 { + let config = provider.provide().map_err(|e| ); + } + + self.0 + .build() + .map_err(Error::Config)? 
+ .try_deserialize() + .map_err(Error::Config) + } +} + +impl Default for Loader { + fn default() -> Self { + Self::empty() + .with_default_search_paths() + .with_default_env_prefix() + } +} + +#[cfg(test)] +mod tests { + use crate::Loader; + + #[test] + fn an_empty_loader_unwraps() { + Loader::empty(); + } +} diff --git a/wdl-grammar/Cargo.toml b/wdl-grammar/Cargo.toml index bc02025b9..2ab8e4090 100644 --- a/wdl-grammar/Cargo.toml +++ b/wdl-grammar/Cargo.toml @@ -10,8 +10,10 @@ repository = "https://github.com/stjude-rust-labs/wdl" documentation = "https://docs.rs/wdl-grammar" [dependencies] +itertools = { workspace = true } logos = { workspace = true } rowan = { workspace = true } +strum = { version = "0.26", features = ["derive"] } codespan-reporting = { workspace = true, optional = true } [dev-dependencies] diff --git a/wdl-grammar/src/tree.rs b/wdl-grammar/src/tree.rs index 31e5013ae..87622378e 100644 --- a/wdl-grammar/src/tree.rs +++ b/wdl-grammar/src/tree.rs @@ -3,10 +3,15 @@ pub mod dive; use std::borrow::Cow; +use std::collections::VecDeque; use std::fmt; +use itertools::FoldWhile; +use itertools::Itertools as _; +use rowan::Direction; use rowan::GreenNodeBuilder; use rowan::GreenNodeData; +use strum::VariantArray; use super::grammar; use super::lexer::Lexer; @@ -22,7 +27,7 @@ use crate::parser::Parser; /// Tokens are terminal and represent any span of the source. /// /// This enumeration is a union of all supported WDL tokens and nodes. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, VariantArray)] #[repr(u16)] pub enum SyntaxKind { /// The token is unknown to WDL. @@ -251,9 +256,9 @@ pub enum SyntaxKind { MetadataObjectNode, /// Represents a metadata array node. MetadataArrayNode, - /// Represents a literal integer node. + /// Represents a literal integer node. LiteralIntegerNode, - /// Represents a literal float node. + /// Represents a literal float node. 
+ LiteralFloatNode, /// Represents a literal boolean node. LiteralBooleanNode, @@ -363,6 +368,9 @@ pub enum SyntaxKind { MAX, } +/// Every [`SyntaxKind`] variant. +pub static ALL_SYNTAX_KIND: &[SyntaxKind] = SyntaxKind::VARIANTS; + impl From for rowan::SyntaxKind { fn from(kind: SyntaxKind) -> Self { rowan::SyntaxKind(kind as u16) @@ -505,3 +513,293 @@ impl fmt::Debug for SyntaxTree { self.0.fmt(f) } } + +/// Gathers comments from a stream of [`SyntaxElement`]s and groups comments +/// logically from that stream (separated by whitespace). +/// +/// Only the unbroken run of comment and whitespace siblings next to `source` +/// in the given `direction` is examined. Comments that are not separated by a +/// whitespace token containing more than one newline are joined with a single +/// space into one entry. Leading `#` and whitespace characters and trailing +/// whitespace are stripped from each comment. When `break_on_newline` is +/// `true`, gathering stops at the first whitespace token that contains a +/// newline. +fn gather_comments( + source: &T, + direction: Direction, + break_on_newline: bool, +) -> Box<[String]> { + let iter = source.siblings_with_tokens(direction); + + /// Adds the text to the currently collecting buffer in the right place + /// depending on the direction we are traversing. + fn extend_buffer(text: String, buffer: &mut VecDeque, direction: &Direction) { + match direction { + Direction::Next => buffer.push_back(text), + Direction::Prev => buffer.push_front(text), + } + } + + let (mut comments, buffer) = iter + .skip_while(|e| source.matches(e)) + .take_while(|e| matches!(e.kind(), SyntaxKind::Comment | SyntaxKind::Whitespace)) + .fold_while( + (VecDeque::new(), VecDeque::new()), + |(mut results, mut buffer), e| { + match e.kind() { + SyntaxKind::Comment => { + let text = e + .into_token() + .expect("comment should always be a token") + .to_string() + // NOTE: only `trim_end()` is needed here + // because the beginning is trimmed in the + // `skip_while` below (after the comment + // character). + .trim_end() + .chars() + .skip_while(|c| *c == '#' || c.is_whitespace()) + // TODO(clay): perhaps there is a way to avoid + // an allocation here, but I cannot think of a + // simple one. 
+ .collect::(); + + extend_buffer(text, &mut buffer, &direction); + } + SyntaxKind::Whitespace => { + let newlines = e + .into_token() + .expect("whitespace should always be a token") + .to_string() + .chars() + .filter(|c| *c == '\n') + .count(); + + if break_on_newline && newlines > 0 { + return FoldWhile::Done((results, buffer)); + } + + // If there is more than one newline in the + // whitespace token, then there is whitespace + // separating two comments and the result should be + // cut/pushed, and the buffer should be reset. + // + // NOTE(review): finished groups are always pushed to + // the front, so for `Direction::Next` they end up in + // reverse source order (the `succeeding_comments` + // test expects this), and an empty buffer yields an + // empty-string entry here. Confirm both are intended. + if newlines > 1 { + results.push_front(std::mem::take(&mut buffer).into_iter().join(" ")); + } + } + // SAFETY: we just filtered out any non-comment and + // non-whitespace nodes above, so this should never occur. + _ => unreachable!(), + } + + FoldWhile::Continue((results, buffer)) + }, + ) + .into_inner(); + + if !buffer.is_empty() { + comments.push_front(buffer.into_iter().join(" ")); + } + + // NOTE: most of the time, this conversion will be O(1). Occasionally + // it will be O(n). No allocations will ever be done. Thus, the + // amortized cost of this is quite cheap. + Vec::from(comments).into_boxed_slice() +} + +/// An extension trait for [`SyntaxNode`]s, [`SyntaxToken`]s, and +/// [`SyntaxElement`]s. +pub trait SyntaxExt { + /// Returns whether `self` matches the provided element. + fn matches(&self, other: &SyntaxElement) -> bool; + + /// Gets the siblings with tokens. + /// + /// **NOTE:** this is needed because Rowan does not encapsulate this + /// functionality in a trait. Once wrapped here, most of the functions + /// provided by this extension trait can just be provided, which simplifies + /// the code. Generally speaking, this should just defer to the underlying + /// `siblings_with_tokens` method for each type. + fn siblings_with_tokens(&self, direction: Direction) -> impl Iterator; + + /// Returns all of the siblings _before_ the current element. + /// + /// The siblings are returned in the order they were parsed. 
+ fn preceding_siblings(&self) -> Box<[SyntaxElement]> { + let mut results = VecDeque::new(); + + self.siblings_with_tokens(Direction::Prev) + // NOTE: this `skip_while` is necessary because + // `siblings_with_tokens` returns the current node. + .skip_while(|e| self.matches(e)) + .for_each(|e| results.push_front(e)); + + // NOTE: most of the time, this conversion will be O(1). Occasionally + // it will be O(n). No allocations will ever be done. Thus, the + // amortized cost of this is quite cheap. + Vec::from(results).into_boxed_slice() + } + + /// Returns all of the siblings _after_ the current element. + /// + /// The siblings are returned in the order they were parsed. + fn succeeding_siblings(&self) -> Box<[SyntaxElement]> { + let mut results = Vec::new(); + + self.siblings_with_tokens(Direction::Next) + // NOTE: this `skip_while` is necessary because + // `siblings_with_tokens` returns the current node. + .skip_while(|e| self.matches(e)) + .for_each(|e| results.push(e)); + + // NOTE: `into_boxed_slice` discards any excess capacity first, so this + // may reallocate when the vector's capacity exceeds its length. + results.into_boxed_slice() + } + + /// Gets all elements that are adjacent to a particular element (not + /// including the element itself). This means in both the forward and + /// reverse direction. + /// + /// The siblings are returned in the order they were parsed. + fn adjacent(&self) -> Box<[SyntaxElement]> { + let mut results = Vec::from(self.preceding_siblings()); + results.extend(self.succeeding_siblings().iter().cloned()); + + // NOTE: `into_boxed_slice` discards any excess capacity first, so this + // may reallocate when the vector's capacity exceeds its length. + results.into_boxed_slice() + } + + /// Gets all of the preceding comments for an element. + fn preceding_comments(&self) -> Box<[String]> + where + Self: Sized, + { + gather_comments(self, Direction::Prev, false) + } + + /// Gets all of the succeeding comments for an element. 
+ fn succeeding_comments(&self) -> Box<[String]> + where + Self: Sized, + { + gather_comments(self, Direction::Next, false) + } + + /// Gets all of the inline comments directly following an element on the + /// same line. + fn inline_comment(&self) -> Option + where + Self: Sized, + { + gather_comments(self, Direction::Next, true) + // NOTE: at most, there can be one contiguous comment on a line. + .first() + .cloned() + } +} + +impl SyntaxExt for SyntaxNode { + fn matches(&self, other: &SyntaxElement) -> bool { + other.as_node().map(|n| n == self).unwrap_or(false) + } + + fn siblings_with_tokens(&self, direction: Direction) -> impl Iterator { + self.siblings_with_tokens(direction) + } +} + +impl SyntaxExt for SyntaxToken { + fn matches(&self, other: &SyntaxElement) -> bool { + other.as_token().map(|n| n == self).unwrap_or(false) + } + + fn siblings_with_tokens(&self, direction: Direction) -> impl Iterator { + self.siblings_with_tokens(direction) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::SyntaxTree; + + #[test] + fn preceding_comments() { + let (tree, diagnostics) = SyntaxTree::parse( + "version 1.2 + +# This comment should not be included +task foo {} + +# Some +# comments +# are +# long + +# Others are short + +# and, yet another +workflow foo {} # This should not be collected. + +# This comment should not be included either.", + ); + + assert!(diagnostics.is_empty()); + + let workflow = tree.root().last_child().unwrap(); + assert_eq!(workflow.kind(), SyntaxKind::WorkflowDefinitionNode); + assert_eq!( + workflow.preceding_comments().as_ref(), + vec![ + "Some comments are long", + "Others are short", + "and, yet another" + ] + ); + } + + #[test] + fn succeeding_comments() { + let (tree, diagnostics) = SyntaxTree::parse( + "version 1.2 + +# This comment should not be included +task foo {} + +# This should not be collected. +workflow foo {} # Here is a comment that should be collected. 
+ +# This comment should be included too.", + ); + + assert!(diagnostics.is_empty()); + + let workflow = tree.root().last_child().unwrap(); + assert_eq!(workflow.kind(), SyntaxKind::WorkflowDefinitionNode); + assert_eq!( + workflow.succeeding_comments().as_ref(), + vec![ + "This comment should be included too.", + "Here is a comment that should be collected." + ] + ); + } + + #[test] + fn inline_comment() { + let (tree, diagnostics) = SyntaxTree::parse( + "version 1.2 + +# This comment should not be included +task foo {} + +# This should not be collected. +workflow foo {} # Here is a comment that should be collected. + +# This comment should not be included either.", + ); + + assert!(diagnostics.is_empty()); + + let workflow = tree.root().last_child().unwrap(); + assert_eq!(workflow.kind(), SyntaxKind::WorkflowDefinitionNode); + assert_eq!( + workflow.inline_comment().as_deref(), + Some("Here is a comment that should be collected.") + ); + } +}