Skip to content

Commit

Permalink
Support backtracking and Checkpoints across nodes (#68)
Browse files Browse the repository at this point in the history
* Add [`GreenTreeBuilder::revert`] to support backtracking parsers

Rowan, and hence CSTree, is designed around hand-written parsers.
In particular, the APIs for *building* trees require that each token is recorded only once.

Some parsers, and especially parser combinators, use backtracking instead, where the same token may be seen multiple times.
To support this, add a new `revert` function which discards all tokens seen since the last checkpoint.

* allow checkpointing across nodes (within reason)

* clean up asserts and expand documentation

* add Changelog entry

* prepare v0.12.2 release

---------

Co-authored-by: jyn <github@jyn.dev>
Co-authored-by: Domenic Quirl <DomenicQuirl@protonmail.com>
  • Loading branch information
3 people authored Nov 1, 2024
1 parent 7f85158 commit d602fe2
Show file tree
Hide file tree
Showing 8 changed files with 489 additions and 21 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

## Unreleased

## `v0.12.2`

* `Checkpoint`s for the `GreenNodeBuilder` can now be used across node boundaries, meaning you can use them to wrap (finished) nodes in addition to just tokens.
* A new method `Checkpoint::revert_to` has been added which resets a `GreenNodeBuilder` to the state it was in when the checkpoint was taken, allowing a parser to backtrack to the checkpoint.

## `v0.12.1`

* Implement `Hash` and `Eq` for `ResolvedNode` and `ResolvedToken`

## `v0.12.0`
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ resolver = "2"

[workspace.package]
edition = "2021"
version = "0.12.1" # when updating, also update `#![doc(html_root_url)]` and any inter-crate dependencies (such as `cstree`'s dependency on `cstree-derive`)
version = "0.12.2" # when updating, also update `#![doc(html_root_url)]` and any inter-crate dependencies (such as `cstree`'s dependency on `cstree-derive`)
authors = [
"Domenic Quirl <DomenicQuirl@pm.me>",
"Aleksey Kladov <aleksey.kladov@gmail.com>",
Expand Down
2 changes: 1 addition & 1 deletion cstree/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ indexmap = "2.4.0"

[dependencies.cstree_derive]
path = "../cstree-derive"
version = "0.12.1" # must match the `cstree` version in the virtual workspace manifest
version = "0.12.2" # must match the `cstree` version in the virtual workspace manifest
optional = true

[dependencies.lasso]
Expand Down
148 changes: 135 additions & 13 deletions cstree/src/green/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,10 @@ where

/// A checkpoint for maybe wrapping a node. See [`GreenNodeBuilder::checkpoint`] for details.
#[derive(Clone, Copy, Debug)]
pub struct Checkpoint(usize);
pub struct Checkpoint {
parent_idx: usize,
child_idx: usize,
}

/// A builder for green trees.
/// Construct with [`new`](GreenNodeBuilder::new), [`with_cache`](GreenNodeBuilder::with_cache), or
Expand Down Expand Up @@ -442,10 +445,12 @@ where
self.children.push(node.into());
}

/// Prepare for maybe wrapping the next node with a surrounding node.
/// Take a snapshot of the builder's state, which can be used to retroactively insert surrounding nodes by calling
/// [`start_node_at`](GreenNodeBuilder::start_node_at), or for backtracking by allowing to
/// [`revert_to`](GreenNodeBuilder::revert_to) the returned checkpoint.
///
/// The way wrapping works is that you first get a checkpoint, then you add nodes and tokens as
/// normal, and then you *maybe* call [`start_node_at`](GreenNodeBuilder::start_node_at).
/// Checkpoints are _invalidated_ and can no longer be used if nodes older than (started before) the checkpoint are
/// finished, and also if the builder is reverted past them to an even earlier checkpoint.
///
/// # Examples
/// ```
Expand All @@ -469,27 +474,144 @@ where
/// ```
#[inline]
pub fn checkpoint(&self) -> Checkpoint {
Checkpoint(self.children.len())
Checkpoint {
parent_idx: self.parents.len(),
child_idx: self.children.len(),
}
}

/// Wrap the previous branch marked by [`checkpoint`](GreenNodeBuilder::checkpoint) in a new
/// branch and make it current.
/// Restore the builder's state to when the [`Checkpoint`] was taken.
///
/// This is useful for backtracking parsers. It will delete any nodes that were started but not finished since the
/// checkpoint was taken, as well as any tokens that were recorded since. Checkpoints can be nested, but reverting
/// to a checkpoint invalidates all other checkpoints that were taken after it.
///
/// ## Panics
///
/// When using checkpoints, calls to [`start_node`](GreenNodeBuilder::start_node) and
/// [`finish_node`](GreenNodeBuilder::finish_node) must always be balanced in between the call
/// to [`checkpoint`](GreenNodeBuilder::checkpoint) and a call to either `revert_to` or
/// [`start_node_at`](GreenNodeBuilder::start_node_at).
/// Since it is impossible to revert the finishing of any node started before taking the checkpoint, this function
/// will panic if it detects that this is the case.
///
/// Example:
/// ```rust
/// # use cstree::testing::*;
/// # use cstree::build::GreenNodeBuilder;
/// # struct Parser;
/// # impl Parser {
/// # fn peek(&self) -> Option<TestSyntaxKind> { None }
/// # fn parse_expr(&mut self) {}
/// # }
/// # let mut builder: GreenNodeBuilder<MySyntax> = GreenNodeBuilder::new();
/// # let mut parser = Parser;
/// let checkpoint = builder.checkpoint();
/// parser.parse_expr();
/// if let Some(Plus) = parser.peek() {
/// // 1 + 2 = Add(1, 2)
/// builder.start_node_at(checkpoint, Operation);
/// parser.parse_expr();
/// builder.finish_node();
/// } else {
/// builder.revert_to(checkpoint);
/// }
/// ```
pub fn revert_to(&mut self, checkpoint: Checkpoint) {
let Checkpoint { parent_idx, child_idx } = checkpoint;
// NOTE: This doesn't catch scenarios where we've read more tokens since the previous revert
// (see test `misuse3`), but it'll catch simple cases of reverting to the past and then
// trying to revert to the future (see tests `misuse` and `misuse2`).
//
// See `start_node_at` for a general explanation of the conditions here. As opposed to `start_node_at`,
// unfinished nodes are fine here (since we're going to revert them), so a `parent_idx` less than
// `self.parents.len()` is valid.
assert!(
parent_idx <= self.parents.len(),
"checkpoint no longer valid, was `finish_node` called early or did you already `revert_to`?"
);
assert!(
child_idx <= self.children.len(),
"checkpoint no longer valid after reverting to an earlier checkpoint"
);
if let Some(&(_, first_child)) = self.parents.last() {
assert!(
child_idx >= first_child,
"checkpoint no longer valid, was an unmatched start_node_at called?"
);
}

self.parents.truncate(parent_idx);
self.children.truncate(child_idx);
}

/// Start a node at the given [`checkpoint`](GreenNodeBuilder::checkpoint), wrapping all nodes
/// and tokens created since the checkpoint was taken.
///
/// ## Panics
///
/// When using checkpoints, calls to [`start_node`](GreenNodeBuilder::start_node) and
/// [`finish_node`](GreenNodeBuilder::finish_node) must always be balanced in between the call
/// to [`checkpoint`](GreenNodeBuilder::checkpoint) and a call to either
/// [`revert_to`](GreenNodeBuilder::revert_to) or `start_node_at`:
///
/// It is impossible to start a node retroactively if one or more nodes started before taking
/// the checkpoint have already been finished when trying to call this function. This function
/// will panic if it detects that this is the case.
///
/// This function will also panic if one or more nodes were started _after_ the checkpoint was
/// taken and were not yet finished, which would leave these nodes simultaneously inside and
/// outside the newly started node (neither node would contain the other).
#[inline]
pub fn start_node_at(&mut self, checkpoint: Checkpoint, kind: S) {
let Checkpoint(checkpoint) = checkpoint;
let Checkpoint { parent_idx, child_idx } = checkpoint;
// NOTE: Due to the requirement to balance `start_node` and `finish_node` in between the calls to `checkpoint`
// and this function, there must be exactly the same number of `self.parents` now as there were when the
// checkpoint was created (`start_node` creates a new parent and `finish_node` moves it to `self.children`).
//
// We assert both inequalities separately to give better error messages: if the checkpoint points past the end
// of `self.parents`, this indicates that a node outside the scope of the checkpoint was already finished (or
// reverted), while there being parents past the checkpointed index indicates that new nodes were started that
// were left unfinished.
assert!(
parent_idx <= self.parents.len(),
"checkpoint no longer valid, was `finish_node` called early or did you already `revert_to`?"
);
assert!(
parent_idx >= self.parents.len(),
"checkpoint contains one or more unfinished nodes"
);

// NOTE: The checkpoint can only point past the end of children if there were children on the stack when it was
// taken that got removed before it was used. Since any nodes started after taking the checkpoint will
// point to a `first_child` >= the checkpoint's `child_idx`, they cannot remove such children when they
// are finished. The are only two ways to remove them:
// 1. to finish a node which was already started at the time of taking the checkpoint and which may thus point
// to an earlier child as its `first_child`, or
// 2. to `revert_to` an earlier checkpoint before trying to start a new node at this one.
//
// Since we require calls to `start_node` and `finish_node` to be balanced between taking and using the
// checkpoint, (1) will be caught by the assert against the `parent_idx` above. This will also catch (2) in
// cases where at least one node is started between the earlier checkpoint and this one. However, it may be the
// case that only tokens are recorded in between the two, in which case we need to check the `child_idx` too.
assert!(
checkpoint <= self.children.len(),
"checkpoint no longer valid, was finish_node called early?"
child_idx <= self.children.len(),
"checkpoint no longer valid after reverting to an earlier checkpoint"
);

// NOTE: When `start_node` is called after `checkpoint`, the new node gets pushed to `parents`.
// However, `start_node` and `finish_node` must be balanced inbetween the calls to `checkpoint` and
// `start_node_at`, and `finish_node` causes the topmost parent to become a child instead.
// Therefore, the topmost parent _at the time of calling `start_node_at`_ must still be pointing
// "behind" the checkpoint in the list of children.
if let Some(&(_, first_child)) = self.parents.last() {
assert!(
checkpoint >= first_child,
"checkpoint no longer valid, was an unmatched start_node_at called?"
child_idx >= first_child,
"checkpoint no longer valid, was an unmatched `start_node` or `start_node_at` called?"
);
}

self.parents.push((kind, checkpoint));
self.parents.push((kind, child_idx));
}

/// Complete building the tree.
Expand Down
2 changes: 1 addition & 1 deletion cstree/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
)]
#![warn(missing_docs)]
// Docs.rs
#![doc(html_root_url = "https://docs.rs/cstree/0.12.1")]
#![doc(html_root_url = "https://docs.rs/cstree/0.12.2")]
#![cfg_attr(doc_cfg, feature(doc_cfg))]

pub mod getting_started;
Expand Down
35 changes: 34 additions & 1 deletion cstree/tests/it/main.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
mod basic;
mod regressions;
mod rollback;
mod sendsync;
#[cfg(feature = "serialize")]
mod serde;

use cstree::{
build::{GreenNodeBuilder, NodeCache},
green::GreenNode,
interning::Interner,
interning::{Interner, Resolver},
util::NodeOrToken,
RawSyntaxKind, Syntax,
};

Expand Down Expand Up @@ -78,3 +80,34 @@ where
}
from
}

#[track_caller]
pub fn assert_tree_eq(
(left, left_res): (&SyntaxNode, &impl Resolver),
(right, right_res): (&SyntaxNode, &impl Resolver),
) {
if left.green() == right.green() {
return;
}

if left.kind() != right.kind() || left.children_with_tokens().len() != right.children_with_tokens().len() {
panic!("{} !=\n{}", left.debug(left_res, true), right.debug(right_res, true))
}

for elem in left.children_with_tokens().zip(right.children_with_tokens()) {
match elem {
(NodeOrToken::Node(ln), NodeOrToken::Node(rn)) => assert_tree_eq((ln, left_res), (rn, right_res)),
(NodeOrToken::Node(n), NodeOrToken::Token(t)) => {
panic!("{} != {}", n.debug(left_res, true), t.debug(right_res))
}
(NodeOrToken::Token(t), NodeOrToken::Node(n)) => {
panic!("{} != {}", t.debug(left_res), n.debug(right_res, true))
}
(NodeOrToken::Token(lt), NodeOrToken::Token(rt)) => {
if lt.syntax_kind() != rt.syntax_kind() || lt.resolve_text(left_res) != rt.resolve_text(right_res) {
panic!("{} != {}", lt.debug(left_res), rt.debug(right_res))
}
}
}
}
}
Loading

0 comments on commit d602fe2

Please sign in to comment.