From 595d3f915ab9630fe68faa8dd1e1cf0dd80027fa Mon Sep 17 00:00:00 2001 From: Shadaj Laddad Date: Tue, 3 Dec 2024 16:35:53 -0800 Subject: [PATCH] docs(hydroflow_plus): add initial docs on consistency and safety --- docs/docs/hydroflow_plus/consistency.md | 70 +++++++++++++++++++++++++ docs/docs/hydroflow_plus/stageleft.mdx | 2 +- hydroflow_plus_test/src/lib.rs | 7 +++ 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 docs/docs/hydroflow_plus/consistency.md diff --git a/docs/docs/hydroflow_plus/consistency.md b/docs/docs/hydroflow_plus/consistency.md new file mode 100644 index 00000000000..409f82cb8a1 --- /dev/null +++ b/docs/docs/hydroflow_plus/consistency.md @@ -0,0 +1,70 @@ +--- +sidebar_position: 3 +--- + +# Consistency and Safety +A key feature of Hydroflow+ is its integration with the Rust type system to highlight possible sources of inconsistent distributed behavior due to sources of non-determinism such as batching, timeouts, and message reordering. In this section, we'll walk through the consistency guarantees in Hydroflow+ and how to use the **`unsafe`** keyword as an escape hatch when introducing sources of non-determinism. + +:::note + +Our consistency and safety model is based on the POPL'25 paper [Flo: A Semantic Foundation for Progressive Stream Processing](https://arxiv.org/abs/2411.08274), which covers the formal details and proofs underlying this system. + +::: + +## Eventual Determinism +Hydroflow+ provides strong guarantees on **determinism**, the property that when provided the same inputs, the outputs of the program are always the same. Even when the inputs and outputs are streaming, we can use this property by looking at the **aggregate collection** (i.e. the result of collecting the elements of the stream into a finite collection). This makes it easy to build composable blocks of code without having to worry about runtime behavior such as batching or network delays. 
+ +Because Hydroflow+ programs can involve network delay, we guarantee **eventual determinism**: given a set of streaming inputs which have arrived, the outputs of the program (which continuously change as inputs arrive) will **eventually** have the same _aggregate_ value. + +Again, by focusing on the _aggregate_ value rather than individual outputs, Hydroflow+ programs can involve concepts such as retractions (for incremental computation) while still guaranteeing determinism because the _resolved_ output (after processing retractions) will eventually be the same. + +:::note + +Much existing literature in distributed systems focuses on consistency levels such as "eventual consistency" which typically correspond to guarantees when reading the state of a _replicated_ object (or set of objects) at a _specific point_ in time. Hydroflow+ does not use such a consistency model internally, instead focusing on the values local to each distributed location _over time_. Concepts such as replication, however, can be layered on top of this model. + +::: + +## Unsafe Operations in Hydroflow+ +All **safe** APIs in Hydroflow+ (the ones you can call regularly in Rust), guarantee determinism. But oftentimes it is necessary to do something non-deterministic, like generate events at a fixed time interval or split an input into arbitrarily sized batches. + +Hydroflow+ offers APIs for such concepts behind an **`unsafe`** guard. This keyword is typically used to mark Rust functions that may not be memory-safe, but we reuse this in Hydroflow+ to mark non-deterministic APIs. + +To call such an API, the Rust compiler will ask you to wrap the call in an `unsafe` block. It is typically good practice to also include a `// SAFETY: ...` comment to explain why the non-determinism is there. 
+ +```rust,no_run +# use hydroflow_plus::*; +# let flow = FlowBuilder::new(); +# let stream_inputs = flow.process::<()>().source_iter(q!([123])); +use std::time::Duration; + +unsafe { + // SAFETY: intentional non-determinism + stream_inputs + .sample_every(q!(Duration::from_secs(1))) +}.for_each(q!(|v| println!("Sample: {:?}", v))) +``` + +When writing a function with Hydroflow+ that involves `unsafe` code, it is important to be extra careful about whether the non-determinism is exposed externally. In some applications, a utility function may involve local non-determinism (such as sending retries), but not expose it outside the function (via deduplication). + +But other utilities may expose the non-determinism, in which case they should be marked `unsafe` as well. If the function is public, Rust will require you to put a `# Safety` section in its documentation to explain the non-determinism. + +```rust +# use hydroflow_plus::*; +use std::fmt::Debug; +use std::time::Duration; + +/// ... +/// +/// # Safety +/// This function will non-deterministically print elements +/// from the stream according to a timer. 
+unsafe fn print_samples<T: Debug, L>( + stream: Stream<T, Process<L>, Unbounded> +) { + unsafe { + // SAFETY: documented non-determinism + stream + .sample_every(q!(Duration::from_secs(1))) + }.for_each(q!(|v| println!("Sample: {:?}", v))) +} +``` diff --git a/docs/docs/hydroflow_plus/stageleft.mdx b/docs/docs/hydroflow_plus/stageleft.mdx index e4ac5014c9e..90e4a6bb01c 100644 --- a/docs/docs/hydroflow_plus/stageleft.mdx +++ b/docs/docs/hydroflow_plus/stageleft.mdx @@ -1,6 +1,6 @@ --- title: Stageleft -sidebar_position: 3 +sidebar_position: 4 --- import StageleftDocs from '../../../stageleft/README.md' diff --git a/hydroflow_plus_test/src/lib.rs b/hydroflow_plus_test/src/lib.rs index dac60be739d..032bdb88207 100644 --- a/hydroflow_plus_test/src/lib.rs +++ b/hydroflow_plus_test/src/lib.rs @@ -2,3 +2,10 @@ stageleft::stageleft_no_entry_crate!(); pub mod cluster; pub mod distributed; + +#[doc(hidden)] +#[stageleft::runtime] +pub mod docs { + #[doc = include_str!("../../docs/docs/hydroflow_plus/consistency.md")] + mod consistency {} +}