From 082bae56294a5efdb623e6e34f59c4f306f8b321 Mon Sep 17 00:00:00 2001 From: Jakob Meier Date: Fri, 22 Mar 2024 16:51:30 +0100 Subject: [PATCH 1/5] draft of docs for decisions behind nep-539 --- docs/SUMMARY.md | 1 + docs/architecture/how/base_flow_network.dot | 59 ++++++ docs/architecture/how/base_flow_network.svg | 151 +++++++++++++++ docs/architecture/how/receipt-congestion.md | 172 ++++++++++++++++++ .../how/receipt_flow_example_0.dot | 70 +++++++ .../how/receipt_flow_example_0.svg | 136 ++++++++++++++ .../how/receipt_flow_example_1.dot | 71 ++++++++ .../how/receipt_flow_example_1.svg | 143 +++++++++++++++ 8 files changed, 803 insertions(+) create mode 100644 docs/architecture/how/base_flow_network.dot create mode 100644 docs/architecture/how/base_flow_network.svg create mode 100644 docs/architecture/how/receipt-congestion.md create mode 100644 docs/architecture/how/receipt_flow_example_0.dot create mode 100644 docs/architecture/how/receipt_flow_example_0.svg create mode 100644 docs/architecture/how/receipt_flow_example_1.dot create mode 100644 docs/architecture/how/receipt_flow_example_1.svg diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index fbebca2390b..a3d64dad592 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -13,6 +13,7 @@ - [Transactions And Receipts](./architecture/how/tx_receipts.md) - [Cross shard transactions - deep dive](./architecture/how/cross-shard.md) - [Gas](./architecture/how/gas.md) + - [Receipt Congestion](./architecture/how/receipt-congestion.md) - [Meta transactions](./architecture/how/meta-tx.md) - [Serialization: Borsh, Json, ProtoBuf](./architecture/how/serialization.md) - [Proofs](./architecture/how/proofs.md) diff --git a/docs/architecture/how/base_flow_network.dot b/docs/architecture/how/base_flow_network.dot new file mode 100644 index 00000000000..763f9b597f7 --- /dev/null +++ b/docs/architecture/how/base_flow_network.dot @@ -0,0 +1,59 @@ +digraph G { + rankdir=LR; + + // Nodes + subgraph cluster_shard1 { + label = "Shard 
1"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source1 [label="Source"]; + Shard1 [label="Shard"]; + Sink1 [label="Sink"]; + } + + subgraph cluster_shard2 { + label = "Shard 2"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source2 [label="Source"]; + Shard2 [label="Shard"]; + Sink2 [label="Sink"]; + } + + subgraph cluster_shard3 { + label = "Shard 3"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source3 [label="Source"]; + Shard3 [label="Shard"]; + Sink3 [label="Sink"]; + } + + // Edges + Source1 -> Shard1 [label="∞"]; + Shard1 -> Sink1 [label="1000 Tgas"]; + + Source2 -> Shard2 [label="∞"]; + Shard2 -> Sink2 [label="1000 Tgas"]; + + Source3 -> Shard3 [label="∞"]; + Shard3 -> Sink3 [label="1000 Tgas"]; + + Shard1 -> Shard2 [label="∞"]; + Shard2 -> Shard1 [label="∞"]; + Shard1 -> Shard3 [label="∞"]; + Shard3 -> Shard1 [label="∞"]; + Shard2 -> Shard3 [label="∞"]; + Shard3 -> Shard2 [label="∞"]; + + // Aligning subgraphs + {rank=same; Source1; Source2; Source3;} + {rank=same; Shard1; Shard2; Shard3;} + {rank=same; Sink1; Sink2; Sink3;} +} diff --git a/docs/architecture/how/base_flow_network.svg b/docs/architecture/how/base_flow_network.svg new file mode 100644 index 00000000000..c26d9651cab --- /dev/null +++ b/docs/architecture/how/base_flow_network.svg @@ -0,0 +1,151 @@ + + + + + + +G + + + +Source1 + +Source + + + +Shard1 + +Shard + + + +Source1->Shard1 + + + + + + +Sink1 + +Sink + + + +Shard1->Sink1 + + +1000 Tgas + + + +Shard2 + +Shard + + + +Shard1->Shard2 + + + + + + +Shard3 + +Shard + + + +Shard1->Shard3 + + + + + + +Source2 + +Source + + + +Source2->Shard2 + + + + + + +Shard2->Shard1 + + + + + + +Sink2 + +Sink + + + +Shard2->Sink2 + + +1000 Tgas + + + +Shard2->Shard3 + + + + + + +Source3 + +Source + + + +Source3->Shard3 + + + + + + +Shard3->Shard1 + + + + + + +Shard3->Shard2 + + + + + + +Sink3 + +Sink + + + +Shard3->Sink3 + + +1000 Tgas + + + diff --git a/docs/architecture/how/receipt-congestion.md 
b/docs/architecture/how/receipt-congestion.md new file mode 100644 index 00000000000..3966782f8a1 --- /dev/null +++ b/docs/architecture/how/receipt-congestion.md @@ -0,0 +1,172 @@ +# Receipt Congestion + +Near Protocol executes transactions in multiple steps, or receipts. Once a +transaction is accepted, the system has committed to finish all those receipts +even if it does not know ahead of time how many receipts there will be or on +which shards they will execute. + +This naturally leads to the problem that if shards just keep accepting more +transactions, we might accept workload at a higher rate than we can execute. + +## Cross-shard congestion as flow problem + +For a quick formalized discussion on congestion, let us model the Near Protocol +transaction execution as a [flow +network](https://en.wikipedia.org/wiki/Flow_network). + +Each shard has a source that accepts new transactions and a sink for burning +receipts. The flow is measured in gas. Edges to sinks have a capacity of 1000 +Tgas. (Technically, it should be 1300 but let's keep it simple for this +discussion.) + +![graph](./base_flow_network.svg) + +The edges between shards are not limited in this model. In reality, we are +eventually limited by the receipt sizes and what we can send within a block time +through the network links. But if we only look at that limit, we can send very +many receipts with a lot of gas attached to them. Thus, the model considers it +unlimited. + +Okay, we have the capacities of the network modeled. Now let's look how a +receipt execution maps onto it. + +Let's say a receipt starts at shard 1 with 300 Tgas. While executing, it burns 100 Tgas and +creates an outgoing receipts with 200 Tgas to another shard. We can represent this in the flow network with +100 Tgas to the sink of shard 1 and 200 Tgas to shard 2. + +![graph](./receipt_flow_example_0.svg) + +Note: The graph includes the execution of the next block with the 200 Tgas to th +sink of shard 2. 
This should be interpreted as if we continue sending the exact +same workload on all shards every block. Then we reach this steady state where +we continue to have these gas assignments per edge. + +Now we can do some flow analysis. It is immediately obvious that the total +outflow per chunk is limited to N * 1000 Tgas but the incoming flow is unlimited. + +For a finite amount of time, we can accept more inflow than outflow, we just have to add buffers to store what we cannot execute, yet. But to stay within finite memory requirements, we need to fall back to a flow diagram where outflows are greater or equal to inflows within a finite time frame. + +Next, we look at ideas one at a time before combining some of them into the +cross-shard congestion design proposed in +[NEP-539](https://github.com/near/NEPs/pull/539). + +## Idea 1: Compute the minimum max-flow and stay below that limit + +One approach to solve congestion would be to never allow more work into the +system than we can execute. + +But this is not ideal Just consider this example where everybody tries to access +a contract on the same shard. + +![graph](./receipt_flow_example_1.svg) + +In this workload where everyone want to use the capacity of the same shard, the +max-flow of the system is essentially the 1000 Tgas that shard 3 can execute. No +matter how many additional shards we add, this 1000 Tgas does not increase. + +Consequently, if we want to limit inflow to be the same or lower than the +outflow, we cannot accept more than `1000 Tgas / NUM_SHARDS` of new transactions +per chunk. + +So, can we just put a constant limit on sources that's `1000 Tgas / NUM_SHARDS`? Not +really, as this limit is hardly practical. It means we limit global throughput +to that of a single shard. Then why would we do sharding in the first place? + +The sad thing is, there is no way around it in the most general case. 
A +congestion control strategy that does not apply this limit to this workload will +always have infinitely sized queues. + +Of course, we won't give up. We are not limited to a constant capacity limit, we +can instead adjust it dynamically. We simply have to find a strategy that +detects such workload and eventually applies the required limit. + +Most of these strategies can be gamed by malicious actors and probably that +means we eventually fall back to the minimum of `1000 Tgas / NUM_SHARDS`. But we +are not trying to solve malicious cases just yet, we just want regular users to +be able to utilize the system without congestion and out-of-memory crashes. +me + +## Idea 2: Limit transactions when we use too much memory + +What if we have no limit at the source until we notice we are above the memory +threshold we are comfortable with? Then we can reduce the source capacity in +steps, potentially down to 0, until buffers are getting emptier and we use less +memory again. + +If we do that, we can decide between either applying a global limit on all +sources (allow only `1000 Tgas / NUM_SHARDS` new transactions on all shards) or +applying the limit only to transactions that go to the shard with the congestion +problem. + +The first choice is certainly safe. But it means that a single congested shard +leads to all shards slowing down, even if they could keep working faster without +ever sending receipts to the congested shard. This is a hit to utilization we +want to avoid. So let's try the second way. + +In that case we filter transactions by receiver and keep accepting transactions +that go to non-congested shards. This would work fine, if all transactions would +only have depth 1. + +But receipts produced by an accepted transaction can produce more receipts to +any other shard. Therefore, we might end up accepting more inflow that +indirectly requires bandwidth on the congested shard. 
+ +*TODO: graphic as example* + +Crucially, when accepting a transaction, we don't know ahead of time which +shards will be affected by the full directed graph of receipts in a transaction. +So there is no easy way out here. + +## Idea 3: Apply backpressure to stop all flows to a congested shard + +On top op stopping transactions to congested shards, we can also stop receipts. +We simply put them in a buffer of the sending shard and keep them there until +the congested shard has space again for the receipts. + +The problem with this idea is that it leads to deadlocks where all receipts in +the system are waiting in outgoing buffers but cannot make progress because the +receiving shard already has too high memory usage. + +*TODO: graphic as example*`1000 Tgas / NUM_SHARDS` + +## Idea 4: Keep minimum incoming queue length to avoid deadlocks + +This is the final idea we need. To avoid deadlocks, we ensure that we can always +send receipts to a shard that has not enough work in the delayed receipts queue +already. + +Basically, the backpressure limits from idea 3 are only applied to incoming +receipts but not for the total size. + +We also decided to measure the incoming congestion level using gas rather than +bytes, because it is here to maximize utilization, not to minimize memory +consumption. And utilization is best measured in gas. If we have a queue of +100_000 Tgas waiting, even if only 1% of that is burnt in this step of the +transaction, we still have 1000 Tgas of useful work we can contribute to the +total flow. + +But it preserves the backpressure property in the sense that all shards on a +path from sources to sinks that contribute to congestion will eventually end up +with full buffers. Combined with idea 2, eventually all transactions to those +shards are rejected. All of this without affecting shards that are not on the +critical path. 
+ +*TODO: graphic as example* + +## Putting it all together + +The proposal in [NEP-539](https://github.com/near/NEPs/pull/539) combines all +ideas 2, 3, and 4. + +We have a limit of how much memory we consider to be normal operations (for +example 500 MB). Then we stop new transaction coming in to that shard. That +alone already solves all problems with single-hop transactions. + +Then we apply backpressure for multi-hop receipts and avoid deadlocks by only +applying the backpressure when we still have enough work queued up that holding +it back cannot lead to a slowed down global throughput. + +Finally, we decided to linearly interpolate the limits, as opposed to a binary +activation. This way we don't have to be too precise in finding the right +parameters, as the system should balance itself around a specific limit that +works for each workload. \ No newline at end of file diff --git a/docs/architecture/how/receipt_flow_example_0.dot b/docs/architecture/how/receipt_flow_example_0.dot new file mode 100644 index 00000000000..8906eb9922f --- /dev/null +++ b/docs/architecture/how/receipt_flow_example_0.dot @@ -0,0 +1,70 @@ +digraph G { + rankdir=LR; + + // Invisible nodes for alignment + { + node [shape=point, width=0, height=0, label=""]; + invisible_sink1; + invisible_sink2; + invisible_sink3; + } + + // Nodes + subgraph cluster_shard1 { + label = "Shard 1"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source1 [label="Source"]; + Shard1 [label="Shard 1"]; + Sink1 [label="Sink"]; + + Sink1 -> invisible_sink1 [style=invis]; + } + + subgraph cluster_shard2 { + label = "Shard 2"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source2 [label="Source"]; + Shard2 [label="Shard 2"]; + Sink2 [label="Sink"]; + + invisible_sink1 -> invisible_sink2 [style=invis]; + Sink2 -> invisible_sink2 [style=invis]; + } + + subgraph cluster_shard3 { + label = "Shard 3"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source3 
[label="Source"]; + Shard3 [label="Shard 3"]; + Sink3 [label="Sink"]; + + invisible_sink2 -> invisible_sink3 [style=invis]; + Sink3 -> invisible_sink3 [style=invis]; + } + + // Edges + Source1 -> Shard1 [label="300 Tgas / ∞" color="green" penwidth=3]; + Shard1 -> Sink1 [label="100 Tgas / 1000 Tgas" color="green" penwidth=3]; + + Source2 -> Shard2 [label="0 / ∞"]; + Shard2 -> Sink2 [label="200 Tgas / 1000 Tgas" color="green" penwidth=3]; + + Source3 -> Shard3 [label="0 / ∞"]; + Shard3 -> Sink3 [label="0 / 1000 Tgas"]; + + Shard1 -> Shard2 [label="200 Tgas / ∞" color="green" penwidth=3]; + + // Aligning subgraphs + {rank=same; Source1; Source2; Source3;} + {rank=same; Shard1; Shard2; Shard3;} + {rank=same; Sink1; Sink2; Sink3; invisible_sink1; invisible_sink2; invisible_sink3;} +} diff --git a/docs/architecture/how/receipt_flow_example_0.svg b/docs/architecture/how/receipt_flow_example_0.svg new file mode 100644 index 00000000000..1db22ad9d80 --- /dev/null +++ b/docs/architecture/how/receipt_flow_example_0.svg @@ -0,0 +1,136 @@ + + + + + + +G + + + +invisible_sink1 + + + + +invisible_sink2 + + + + + +invisible_sink3 + + + + + +Source1 + +Source + + + +Shard1 + +Shard 1 + + + +Source1->Shard1 + + +300 Tgas / ∞ + + + +Sink1 + +Sink + + + +Shard1->Sink1 + + +100 Tgas / 1000 Tgas + + + +Shard2 + +Shard 2 + + + +Shard1->Shard2 + + +200 Tgas / ∞ + + + + +Source2 + +Source + + + +Source2->Shard2 + + +0 / ∞ + + + +Sink2 + +Sink + + + +Shard2->Sink2 + + +200 Tgas / 1000 Tgas + + + + +Source3 + +Source + + + +Shard3 + +Shard 3 + + + +Source3->Shard3 + + +0 / ∞ + + + +Sink3 + +Sink + + + +Shard3->Sink3 + + +0 / 1000 Tgas + + + + diff --git a/docs/architecture/how/receipt_flow_example_1.dot b/docs/architecture/how/receipt_flow_example_1.dot new file mode 100644 index 00000000000..4f5b5e83ba7 --- /dev/null +++ b/docs/architecture/how/receipt_flow_example_1.dot @@ -0,0 +1,71 @@ +digraph G { + rankdir=LR; + + // Invisible nodes for alignment + { + node [shape=point, width=0, 
height=0, label=""]; + invisible_sink1; + invisible_sink2; + invisible_sink3; + } + + // Nodes + subgraph cluster_shard1 { + label = "Shard 1"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source1 [label="Source"]; + Shard1 [label="Shard 1"]; + Sink1 [label="Sink"]; + + Sink1 -> invisible_sink1 [style=invis]; + } + + subgraph cluster_shard2 { + label = "Shard 2"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source2 [label="Source"]; + Shard2 [label="Shard 2"]; + Sink2 [label="Sink"]; + + invisible_sink1 -> invisible_sink2 [style=invis]; + Sink2 -> invisible_sink2 [style=invis]; + } + + subgraph cluster_shard3 { + label = "Shard 3"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source3 [label="Source"]; + Shard3 [label="Shard 3"]; + Sink3 [label="Sink"]; + + invisible_sink2 -> invisible_sink3 [style=invis]; + Sink3 -> invisible_sink3 [style=invis]; + } + + // Edges + Source1 -> Shard1 [label="1000 Tgas"]; + Shard1 -> Sink1 [label="1 Tgas / 1000 Tgas"]; + + Source2 -> Shard2 [label="1000 Tgas"]; + Shard2 -> Sink2 [label="1 Tgas / 1000 Tgas"]; + + Source3 -> Shard3 [label="1000 Tgas"]; + Shard3 -> Sink3 [label="2998 Tgas / 1000 Tgas" color="red" penwidth=3]; + + Shard1 -> Shard3 [label="999 Tgas"]; + Shard2 -> Shard3 [label="999 Tgas"]; + + // Aligning subgraphs + {rank=same; Source1; Source2; Source3;} + {rank=same; Shard1; Shard2; Shard3;} + {rank=same; Sink1; Sink2; Sink3; invisible_sink1; invisible_sink2; invisible_sink3;} +} diff --git a/docs/architecture/how/receipt_flow_example_1.svg b/docs/architecture/how/receipt_flow_example_1.svg new file mode 100644 index 00000000000..0b64e6ec9f8 --- /dev/null +++ b/docs/architecture/how/receipt_flow_example_1.svg @@ -0,0 +1,143 @@ + + + + + + +G + + + +invisible_sink1 + + + + +invisible_sink2 + + + + + +invisible_sink3 + + + + + +Source1 + +Source + + + +Shard1 + +Shard 1 + + + +Source1->Shard1 + + +1000 Tgas + + + +Sink1 + +Sink + + + +Shard1->Sink1 + + +1 Tgas / 1000 
Tgas + + + +Shard3 + +Shard 3 + + + +Shard1->Shard3 + + +999 Tgas + + + + +Source2 + +Source + + + +Shard2 + +Shard 2 + + + +Source2->Shard2 + + +1000 Tgas + + + +Sink2 + +Sink + + + +Shard2->Sink2 + + +1 Tgas / 1000 Tgas + + + +Shard2->Shard3 + + +999 Tgas + + + + +Source3 + +Source + + + +Source3->Shard3 + + +1000 Tgas + + + +Sink3 + +Sink + + + +Shard3->Sink3 + + +2998 Tgas / 1000 Tgas + + + + From 5a7cf41789c6e84e6266df13beb479db286322fc Mon Sep 17 00:00:00 2001 From: Jakob Meier Date: Mon, 25 Mar 2024 14:51:14 +0100 Subject: [PATCH 2/5] add more graphs and improve explanations --- docs/architecture/how/receipt-congestion.md | 87 ++++-- .../congestion}/base_flow_network.dot | 0 .../congestion}/base_flow_network.svg | 0 .../congestion}/receipt_flow_example_0.dot | 0 .../congestion}/receipt_flow_example_0.svg | 0 .../congestion}/receipt_flow_example_1.dot | 0 .../congestion}/receipt_flow_example_1.svg | 0 .../congestion/receipt_flow_example_1_1.dot | 71 +++++ .../congestion/receipt_flow_example_1_1.svg | 143 ++++++++++ .../congestion/receipt_flow_example_2.dot | 115 ++++++++ .../congestion/receipt_flow_example_2.svg | 241 ++++++++++++++++ .../congestion/receipt_flow_example_3.dot | 122 ++++++++ .../congestion/receipt_flow_example_3.svg | 268 ++++++++++++++++++ .../congestion/receipt_flow_example_3_1.dot | 67 +++++ .../congestion/receipt_flow_example_3_1.svg | 133 +++++++++ .../congestion/receipt_flow_example_4.dot | 83 ++++++ .../congestion/receipt_flow_example_4.svg | 177 ++++++++++++ 17 files changed, 1477 insertions(+), 30 deletions(-) rename docs/{architecture/how => images/congestion}/base_flow_network.dot (100%) rename docs/{architecture/how => images/congestion}/base_flow_network.svg (100%) rename docs/{architecture/how => images/congestion}/receipt_flow_example_0.dot (100%) rename docs/{architecture/how => images/congestion}/receipt_flow_example_0.svg (100%) rename docs/{architecture/how => images/congestion}/receipt_flow_example_1.dot (100%) rename 
docs/{architecture/how => images/congestion}/receipt_flow_example_1.svg (100%) create mode 100644 docs/images/congestion/receipt_flow_example_1_1.dot create mode 100644 docs/images/congestion/receipt_flow_example_1_1.svg create mode 100644 docs/images/congestion/receipt_flow_example_2.dot create mode 100644 docs/images/congestion/receipt_flow_example_2.svg create mode 100644 docs/images/congestion/receipt_flow_example_3.dot create mode 100644 docs/images/congestion/receipt_flow_example_3.svg create mode 100644 docs/images/congestion/receipt_flow_example_3_1.dot create mode 100644 docs/images/congestion/receipt_flow_example_3_1.svg create mode 100644 docs/images/congestion/receipt_flow_example_4.dot create mode 100644 docs/images/congestion/receipt_flow_example_4.svg diff --git a/docs/architecture/how/receipt-congestion.md b/docs/architecture/how/receipt-congestion.md index 3966782f8a1..36b9be409f4 100644 --- a/docs/architecture/how/receipt-congestion.md +++ b/docs/architecture/how/receipt-congestion.md @@ -19,7 +19,7 @@ receipts. The flow is measured in gas. Edges to sinks have a capacity of 1000 Tgas. (Technically, it should be 1300 but let's keep it simple for this discussion.) -![graph](./base_flow_network.svg) +![graph](../../images/congestion/base_flow_network.svg) The edges between shards are not limited in this model. In reality, we are eventually limited by the receipt sizes and what we can send within a block time @@ -34,7 +34,7 @@ Let's say a receipt starts at shard 1 with 300 Tgas. While executing, it burns 1 creates an outgoing receipts with 200 Tgas to another shard. We can represent this in the flow network with 100 Tgas to the sink of shard 1 and 200 Tgas to shard 2. -![graph](./receipt_flow_example_0.svg) +![graph](../../images/congestion/receipt_flow_example_0.svg) Note: The graph includes the execution of the next block with the 200 Tgas to th sink of shard 2. 
This should be interpreted as if we continue sending the exact @@ -58,7 +58,7 @@ system than we can execute. But this is not ideal Just consider this example where everybody tries to access a contract on the same shard. -![graph](./receipt_flow_example_1.svg) +![graph](../../images/congestion/receipt_flow_example_1.svg) In this workload where everyone want to use the capacity of the same shard, the max-flow of the system is essentially the 1000 Tgas that shard 3 can execute. No @@ -68,6 +68,8 @@ Consequently, if we want to limit inflow to be the same or lower than the outflow, we cannot accept more than `1000 Tgas / NUM_SHARDS` of new transactions per chunk. +![graph](../../images/congestion/receipt_flow_example_1_1.svg) + So, can we just put a constant limit on sources that's `1000 Tgas / NUM_SHARDS`? Not really, as this limit is hardly practical. It means we limit global throughput to that of a single shard. Then why would we do sharding in the first place? @@ -81,10 +83,12 @@ can instead adjust it dynamically. We simply have to find a strategy that detects such workload and eventually applies the required limit. Most of these strategies can be gamed by malicious actors and probably that -means we eventually fall back to the minimum of `1000 Tgas / NUM_SHARDS`. But we -are not trying to solve malicious cases just yet, we just want regular users to -be able to utilize the system without congestion and out-of-memory crashes. -me +means we eventually fall back to the minimum of `1000 Tgas / NUM_SHARDS`. But at +this stage our ambition isn't to have 100% utilization under all malicious +cases. We are instead trying to find a solution that can give 100% utilization +for normal operation and then falls back to `1000 Tgas / NUM_SHARDS` when it has +to, in order to prevent out-of-memory crashes. + ## Idea 2: Limit transactions when we use too much memory @@ -94,9 +98,9 @@ steps, potentially down to 0, until buffers are getting emptier and we use less memory again. 
If we do that, we can decide between either applying a global limit on all -sources (allow only `1000 Tgas / NUM_SHARDS` new transactions on all shards) or -applying the limit only to transactions that go to the shard with the congestion -problem. +sources (allow only `1000 Tgas / NUM_SHARDS` new transactions on all shards like +in idea 1) or applying the limit only to transactions that go to the shard with +the congestion problem. The first choice is certainly safe. But it means that a single congested shard leads to all shards slowing down, even if they could keep working faster without @@ -111,23 +115,29 @@ But receipts produced by an accepted transaction can produce more receipts to any other shard. Therefore, we might end up accepting more inflow that indirectly requires bandwidth on the congested shard. -*TODO: graphic as example* +![graph](../../images/congestion/receipt_flow_example_2.svg) Crucially, when accepting a transaction, we don't know ahead of time which shards will be affected by the full directed graph of receipts in a transaction. -So there is no easy way out here. +We only know the first step. For multi-hop transactions, there is no easy way out. + +But it is worth mentioning that in practice the single-hop function call is the +most common case. And this case can be handled nicely by rejecting incoming +transactions to congested shards. ## Idea 3: Apply backpressure to stop all flows to a congested shard -On top op stopping transactions to congested shards, we can also stop receipts. +On top of stopping transactions to congested shards, we can also stop receipts if they have a congested shard as the receiver. We simply put them in a buffer of the sending shard and keep them there until the congested shard has space again for the receipts. 
+![graph](../../images/congestion/receipt_flow_example_3.svg) + The problem with this idea is that it leads to deadlocks where all receipts in the system are waiting in outgoing buffers but cannot make progress because the receiving shard already has too high memory usage. -*TODO: graphic as example*`1000 Tgas / NUM_SHARDS` +![graph](../../images/congestion/receipt_flow_example_3_1.svg) ## Idea 4: Keep minimum incoming queue length to avoid deadlocks @@ -136,22 +146,30 @@ send receipts to a shard that has not enough work in the delayed receipts queue already. Basically, the backpressure limits from idea 3 are only applied to incoming -receipts but not for the total size. +receipts but not for the total size. This guarantees that in congested the +scenario that previously caused a deadlock, we always have something in the +incoming queue to work on, otherwise there wouldn't be backpressure at all. + +![graph](../../images/congestion/receipt_flow_example_4.svg) -We also decided to measure the incoming congestion level using gas rather than +We decided to measure the incoming congestion level using gas rather than bytes, because it is here to maximize utilization, not to minimize memory consumption. And utilization is best measured in gas. If we have a queue of -100_000 Tgas waiting, even if only 1% of that is burnt in this step of the +10_000 Tgas waiting, even if only 10% of that is burnt in this step of the transaction, we still have 1000 Tgas of useful work we can contribute to the -total flow. +total flow. Thus under the assumption that at least 10% of gas is being burnt, +we have 100% utilization. + +Of course, we can increase the queue to have even better utility guarantees. But +it comes at the cost of longer delays for every transaction or receipt that goes +through a congested shard. 
-But it preserves the backpressure property in the sense that all shards on a -path from sources to sinks that contribute to congestion will eventually end up -with full buffers. Combined with idea 2, eventually all transactions to those -shards are rejected. All of this without affecting shards that are not on the -critical path. +This strategy also preserves the backpressure property in the sense that all +shards on a path from sources to sinks that contribute to congestion will +eventually end up with full buffers. Combined with idea 2, eventually all +transactions to those shards are rejected. All of this without affecting shards +that are not on the critical path. -*TODO: graphic as example* ## Putting it all together @@ -159,14 +177,23 @@ The proposal in [NEP-539](https://github.com/near/NEPs/pull/539) combines all ideas 2, 3, and 4. We have a limit of how much memory we consider to be normal operations (for -example 500 MB). Then we stop new transaction coming in to that shard. That -alone already solves all problems with single-hop transactions. +example 500 MB). Then we stop new transaction coming in to that shard but still +allow more incoming transactions to other shards if those are not congested. +That alone already solves all problems with single-hop transactions. + +In the congested shard itself, we also keep accepting transactions to other +shards. But we heavily reduce the gas allocated for new transactions, in order +to have more capacity to work on finishing the waiting receipts. This is +technically not necessary for any specific property, but it should make sense +intuitively that this helps to reduce congestion quicker and therefore lead to a +better user experience. This is why we added this feature. And our simulations +also support this intuition. 
Then we apply backpressure for multi-hop receipts and avoid deadlocks by only applying the backpressure when we still have enough work queued up that holding it back cannot lead to a slowed down global throughput. -Finally, we decided to linearly interpolate the limits, as opposed to a binary -activation. This way we don't have to be too precise in finding the right -parameters, as the system should balance itself around a specific limit that -works for each workload. \ No newline at end of file +Another design decision was to linearly interpolate the limits, as opposed to +binary on and off states. This way, we don't have to be too precise in finding +the right parameters, as the system should balance itself around a specific +limit that works for each workload. \ No newline at end of file diff --git a/docs/architecture/how/base_flow_network.dot b/docs/images/congestion/base_flow_network.dot similarity index 100% rename from docs/architecture/how/base_flow_network.dot rename to docs/images/congestion/base_flow_network.dot diff --git a/docs/architecture/how/base_flow_network.svg b/docs/images/congestion/base_flow_network.svg similarity index 100% rename from docs/architecture/how/base_flow_network.svg rename to docs/images/congestion/base_flow_network.svg diff --git a/docs/architecture/how/receipt_flow_example_0.dot b/docs/images/congestion/receipt_flow_example_0.dot similarity index 100% rename from docs/architecture/how/receipt_flow_example_0.dot rename to docs/images/congestion/receipt_flow_example_0.dot diff --git a/docs/architecture/how/receipt_flow_example_0.svg b/docs/images/congestion/receipt_flow_example_0.svg similarity index 100% rename from docs/architecture/how/receipt_flow_example_0.svg rename to docs/images/congestion/receipt_flow_example_0.svg diff --git a/docs/architecture/how/receipt_flow_example_1.dot b/docs/images/congestion/receipt_flow_example_1.dot similarity index 100% rename from docs/architecture/how/receipt_flow_example_1.dot rename to 
docs/images/congestion/receipt_flow_example_1.dot diff --git a/docs/architecture/how/receipt_flow_example_1.svg b/docs/images/congestion/receipt_flow_example_1.svg similarity index 100% rename from docs/architecture/how/receipt_flow_example_1.svg rename to docs/images/congestion/receipt_flow_example_1.svg diff --git a/docs/images/congestion/receipt_flow_example_1_1.dot b/docs/images/congestion/receipt_flow_example_1_1.dot new file mode 100644 index 00000000000..855dedcaafc --- /dev/null +++ b/docs/images/congestion/receipt_flow_example_1_1.dot @@ -0,0 +1,71 @@ +digraph G { + rankdir=LR; + + // Invisible nodes for alignment + { + node [shape=point, width=0, height=0, label=""]; + invisible_sink1; + invisible_sink2; + invisible_sink3; + } + + // Nodes + subgraph cluster_shard1 { + label = "Shard 1"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source1 [label="Source"]; + Shard1 [label="Shard 1"]; + Sink1 [label="Sink"]; + + Sink1 -> invisible_sink1 [style=invis]; + } + + subgraph cluster_shard2 { + label = "Shard 2"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source2 [label="Source"]; + Shard2 [label="Shard 2"]; + Sink2 [label="Sink"]; + + invisible_sink1 -> invisible_sink2 [style=invis]; + Sink2 -> invisible_sink2 [style=invis]; + } + + subgraph cluster_shard3 { + label = "Shard 3"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source3 [label="Source"]; + Shard3 [label="Shard 3"]; + Sink3 [label="Sink"]; + + invisible_sink2 -> invisible_sink3 [style=invis]; + Sink3 -> invisible_sink3 [style=invis]; + } + + // Edges + Source1 -> Shard1 [label="333 Tgas" color="red" penwidth=3]; + Shard1 -> Sink1 [label="1 Tgas / 1000 Tgas"]; + + Source2 -> Shard2 [label="333 Tgas" color="red" penwidth=3]; + Shard2 -> Sink2 [label="1 Tgas / 1000 Tgas"]; + + Source3 -> Shard3 [label="333 Tgas" color="red" penwidth=3]; + Shard3 -> Sink3 [label="997 Tgas / 1000 Tgas" color="green" penwidth=3]; + + Shard1 -> Shard3 [label="332 Tgas"]; 
+ Shard2 -> Shard3 [label="332 Tgas"]; + + // Aligning subgraphs + {rank=same; Source1; Source2; Source3;} + {rank=same; Shard1; Shard2; Shard3;} + {rank=same; Sink1; Sink2; Sink3; invisible_sink1; invisible_sink2; invisible_sink3;} +} diff --git a/docs/images/congestion/receipt_flow_example_1_1.svg b/docs/images/congestion/receipt_flow_example_1_1.svg new file mode 100644 index 00000000000..aba610fcbc1 --- /dev/null +++ b/docs/images/congestion/receipt_flow_example_1_1.svg @@ -0,0 +1,143 @@ + + + + + + +G + + + +invisible_sink1 + + + + +invisible_sink2 + + + + + +invisible_sink3 + + + + + +Source1 + +Source + + + +Shard1 + +Shard 1 + + + +Source1->Shard1 + + +333 Tgas + + + +Sink1 + +Sink + + + +Shard1->Sink1 + + +1 Tgas / 1000 Tgas + + + +Shard3 + +Shard 3 + + + +Shard1->Shard3 + + +332 Tgas + + + + +Source2 + +Source + + + +Shard2 + +Shard 2 + + + +Source2->Shard2 + + +333 Tgas + + + +Sink2 + +Sink + + + +Shard2->Sink2 + + +1 Tgas / 1000 Tgas + + + +Shard2->Shard3 + + +332 Tgas + + + + +Source3 + +Source + + + +Source3->Shard3 + + +333 Tgas + + + +Sink3 + +Sink + + + +Shard3->Sink3 + + +997 Tgas / 1000 Tgas + + + + diff --git a/docs/images/congestion/receipt_flow_example_2.dot b/docs/images/congestion/receipt_flow_example_2.dot new file mode 100644 index 00000000000..81280f37ab4 --- /dev/null +++ b/docs/images/congestion/receipt_flow_example_2.dot @@ -0,0 +1,115 @@ +digraph G { + rankdir=LR; + + // Invisible nodes for alignment + { + node [shape=point, width=0, height=0, label=""]; + invisible_sink1; + invisible_sink2; + invisible_sink3; + invisible_sink4; + invisible_sink5; + } + + // Nodes + subgraph cluster_shard1 { + label = "Shard 1"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source1 [label="Source"]; + Shard1 [label="Shard 1"]; + Sink1 [label="Sink"]; + + Sink1 -> invisible_sink1 [style=invis]; + } + + subgraph cluster_shard2 { + label = "Shard 2"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source2 [label="Source"]; 
+ Shard2 [label="Shard 2"]; + Sink2 [label="Sink"]; + + invisible_sink1 -> invisible_sink2 [style=invis]; + Source1 -> Source2 [style=invis]; + Sink2 -> invisible_sink2 [style=invis]; + } + + subgraph cluster_shard3 { + label = "Shard 3"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source3 [label="Source"]; + Shard3 [label="Shard 3"]; + Sink3 [label="Sink"]; + + invisible_sink2 -> invisible_sink3 [style=invis]; + Source2 -> Source3 [style=invis]; + Sink3 -> invisible_sink3 [style=invis]; + } + + subgraph cluster_shard4 { + label = "Shard 4"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source4 [label="Source"]; + Shard4 [label="Shard 4"]; + Sink4 [label="Sink"]; + + invisible_sink3 -> invisible_sink4 [style=invis]; + Source3 -> Source4 [style=invis]; + Shard3 -> Shard4 [style=invis]; + Sink4 -> invisible_sink4 [style=invis]; + } + + subgraph cluster_shard5 { + label = "Shard 5"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source5 [label="Source"]; + Shard5 [label="Shard 5"]; + Sink5 [label="Sink"]; + + invisible_sink4 -> invisible_sink5 [style=invis]; + Source4 -> Source5 [style=invis]; + Shard4 -> Shard5 [style=invis]; + Sink5 -> invisible_sink5 [style=invis]; + } + + // Edges + Source1 -> Shard1 [label="1000 Tgas" color="red" penwidth=3]; + Source2 -> Shard2 [label="blocked tx to shard 3" color="green" penwidth=3]; + Source3 -> Shard3 [label="blocked tx to shard 3" color="green" penwidth=3]; + Source4 -> Shard4 [label="blocked tx to shard 3" color="green" penwidth=3]; + Source5 -> Shard5 [label="1000 Tgas" color="red" penwidth=3]; + + Shard1 -> Sink1 [label="10 Tgas / 1000 Tgas"]; + Shard2 -> Sink2 [label="10 Tgas / 1000 Tgas"]; + Shard3 -> Sink3 [label="1980 Tgas / 1000 Tgas" color="red" penwidth=3]; + Shard4 -> Sink4 [label="10 Tgas / 1000 Tgas"]; + Shard5 -> Sink5 [label="10 Tgas / 1000 Tgas"]; + + + Shard1 -> Shard2 [label="990 Tgas" color="red" penwidth=2]; + Shard2 -> Shard3 [label="980 Tgas" 
color="red" penwidth=2]; + Shard4 -> Shard3 [label="980 Tgas" color="red" penwidth=2]; + Shard5 -> Shard4 [label="990 Tgas" color="red" penwidth=2]; + + + + // Aligning subgraphs + {rank=same; Source1; Source2; Source3; Source4; Source5;} + {rank=same; Shard1; Shard2; Shard3; Shard4; Shard5;} + {rank=same; Sink1; Sink2; Sink3; Sink4; Sink5; invisible_sink1; invisible_sink2; invisible_sink3; invisible_sink4; invisible_sink5;} +} diff --git a/docs/images/congestion/receipt_flow_example_2.svg b/docs/images/congestion/receipt_flow_example_2.svg new file mode 100644 index 00000000000..d0cfd6553ab --- /dev/null +++ b/docs/images/congestion/receipt_flow_example_2.svg @@ -0,0 +1,241 @@ + + + + + + +G + + + +invisible_sink1 + + + + +invisible_sink2 + + + + + +invisible_sink3 + + + + + +invisible_sink4 + + + + + +invisible_sink5 + + + + + +Source1 + +Source + + + +Shard1 + +Shard 1 + + + +Source1->Shard1 + + +1000 Tgas + + + +Source2 + +Source + + + + +Sink1 + +Sink + + + +Shard1->Sink1 + + +10 Tgas / 1000 Tgas + + + +Shard2 + +Shard 2 + + + +Shard1->Shard2 + + +990 Tgas + + + + +Source2->Shard2 + + +blocked tx to shard 3 + + + +Source3 + +Source + + + + +Sink2 + +Sink + + + +Shard2->Sink2 + + +10 Tgas / 1000 Tgas + + + +Shard3 + +Shard 3 + + + +Shard2->Shard3 + + +980 Tgas + + + + +Source3->Shard3 + + +blocked tx to shard 3 + + + +Source4 + +Source + + + + +Sink3 + +Sink + + + +Shard3->Sink3 + + +1980 Tgas / 1000 Tgas + + + +Shard4 + +Shard 4 + + + + + +Source4->Shard4 + + +blocked tx to shard 3 + + + +Source5 + +Source + + + + +Shard4->Shard3 + + +980 Tgas + + + +Sink4 + +Sink + + + +Shard4->Sink4 + + +10 Tgas / 1000 Tgas + + + +Shard5 + +Shard 5 + + + + + +Source5->Shard5 + + +1000 Tgas + + + +Shard5->Shard4 + + +990 Tgas + + + +Sink5 + +Sink + + + +Shard5->Sink5 + + +10 Tgas / 1000 Tgas + + + + diff --git a/docs/images/congestion/receipt_flow_example_3.dot b/docs/images/congestion/receipt_flow_example_3.dot new file mode 100644 index 00000000000..10f6e1645bf --- 
/dev/null +++ b/docs/images/congestion/receipt_flow_example_3.dot @@ -0,0 +1,122 @@ +digraph G { + rankdir=LR; + + // Invisible nodes for alignment + { + node [shape=point, width=0, height=0, label=""]; + invisible_sink1; + invisible_sink2; + invisible_sink3; + invisible_sink4; + invisible_sink5; + } + + // Nodes + subgraph cluster_shard1 { + label = "Shard 1"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source1 [label="Source"]; + Shard1 [label="Shard 1"]; + Sink1 [label="Sink"]; + Buffer1 [label="Outgoing Buffer fills to max" shape=box color="red" penwidth=2]; + + + Sink1 -> invisible_sink1 [style=invis]; + } + + subgraph cluster_shard2 { + label = "Shard 2"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source2 [label="Source"]; + Shard2 [label="Shard 2"]; + Sink2 [label="Sink"]; + + invisible_sink1 -> invisible_sink2 [style=invis]; + Source1 -> Source2 [style=invis]; + Sink2 -> invisible_sink2 [style=invis]; + } + + subgraph cluster_shard3 { + label = "Shard 3"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source3 [label="Source"]; + Shard3 [label="Shard 3"]; + Sink3 [label="Sink"]; + + invisible_sink2 -> invisible_sink3 [style=invis]; + Source2 -> Source3 [style=invis]; + Sink3 -> invisible_sink3 [style=invis]; + } + + subgraph cluster_shard4 { + label = "Shard 4"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source4 [label="Source"]; + Shard4 [label="Shard 4"]; + Sink4 [label="Sink"]; + + invisible_sink3 -> invisible_sink4 [style=invis]; + Source3 -> Source4 [style=invis]; + Shard3 -> Shard4 [style=invis]; + Sink4 -> invisible_sink4 [style=invis]; + } + + subgraph cluster_shard5 { + label = "Shard 5"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source5 [label="Source"]; + Shard5 [label="Shard 5"]; + Sink5 [label="Sink"]; + Buffer5 [label="Outgoing Buffer fills to max" shape=box color="red" penwidth=2]; + + invisible_sink4 -> invisible_sink5 [style=invis]; + Source4 -> 
Source5 [style=invis]; + Shard4 -> Shard5 [style=invis]; + Sink5 -> invisible_sink5 [style=invis]; + Buffer5 -> Sink5 [style=invis]; + } + + // Edges + Source1 -> Shard1 [label="blocked tx to shard 2" color="green" penwidth=3]; + Source2 -> Shard2 [label="blocked tx to shard 3" color="green" penwidth=3]; + Source3 -> Shard3 [label="blocked tx to shard 3" color="green" penwidth=3]; + Source4 -> Shard4 [label="blocked tx to shard 3" color="green" penwidth=3]; + Source5 -> Shard5 [label="blocked tx to shard 4" color="green" penwidth=3]; + + Shard1 -> Sink1 [label="10 Tgas / 1000 Tgas"]; + Shard2 -> Sink2 [label="10 Tgas / 1000 Tgas"]; + Shard3 -> Sink3 [label="1000 Tgas / 1000 Tgas" color="green" penwidth=3]; + Shard4 -> Sink4 [label="10 Tgas / 1000 Tgas"]; + Shard5 -> Sink5 [label="10 Tgas / 1000 Tgas"]; + + + Shard1 -> Buffer1 [label="backpressure" color="green" penwidth=2] + Buffer1 -> Shard2 [label="510 Tgas" color="green" penwidth=2]; + Shard2 -> Shard3 [label="500 Tgas" color="green" penwidth=2]; + Shard4 -> Shard3 [label="500 Tgas" color="green" penwidth=2]; + Shard5 -> Buffer5 [label="backpressure" color="green" penwidth=2]; + Buffer5 -> Shard4 [label="510 Tgas" color="green" penwidth=2]; + + + + // Aligning subgraphs + {rank=same; Source1; Source2; Source3; Source4; Source5;} + {rank=same; Shard1; Shard2; Shard3; Shard4; Shard5;} + {rank=same; Buffer1; Buffer5;} + {rank=same; Sink1; Sink2; Sink3; Sink4; Sink5; invisible_sink1; invisible_sink2; invisible_sink3; invisible_sink4; invisible_sink5;} +} diff --git a/docs/images/congestion/receipt_flow_example_3.svg b/docs/images/congestion/receipt_flow_example_3.svg new file mode 100644 index 00000000000..d2643102bde --- /dev/null +++ b/docs/images/congestion/receipt_flow_example_3.svg @@ -0,0 +1,268 @@ + + + + + + +G + + + +invisible_sink1 + + + + +invisible_sink2 + + + + + +invisible_sink3 + + + + + +invisible_sink4 + + + + + +invisible_sink5 + + + + + +Source1 + +Source + + + +Shard1 + +Shard 1 + + + 
+Source1->Shard1 + + +blocked tx to shard 2 + + + +Source2 + +Source + + + + +Sink1 + +Sink + + + +Shard1->Sink1 + + +10 Tgas / 1000 Tgas + + + +Buffer1 + +Outgoing Buffer fills to max + + + +Shard1->Buffer1 + + +backpressure + + + + +Shard2 + +Shard 2 + + + +Buffer1->Shard2 + + +510 Tgas + + + +Source2->Shard2 + + +blocked tx to shard 3 + + + +Source3 + +Source + + + + +Sink2 + +Sink + + + +Shard2->Sink2 + + +10 Tgas / 1000 Tgas + + + +Shard3 + +Shard 3 + + + +Shard2->Shard3 + + +500 Tgas + + + + +Source3->Shard3 + + +blocked tx to shard 3 + + + +Source4 + +Source + + + + +Sink3 + +Sink + + + +Shard3->Sink3 + + +1000 Tgas / 1000 Tgas + + + +Shard4 + +Shard 4 + + + + + +Source4->Shard4 + + +blocked tx to shard 3 + + + +Source5 + +Source + + + + +Shard4->Shard3 + + +500 Tgas + + + +Sink4 + +Sink + + + +Shard4->Sink4 + + +10 Tgas / 1000 Tgas + + + +Shard5 + +Shard 5 + + + + + +Source5->Shard5 + + +blocked tx to shard 4 + + + +Sink5 + +Sink + + + +Shard5->Sink5 + + +10 Tgas / 1000 Tgas + + + +Buffer5 + +Outgoing Buffer fills to max + + + +Shard5->Buffer5 + + +backpressure + + + + +Buffer5->Shard4 + + +510 Tgas + + + + diff --git a/docs/images/congestion/receipt_flow_example_3_1.dot b/docs/images/congestion/receipt_flow_example_3_1.dot new file mode 100644 index 00000000000..e4453d4cc73 --- /dev/null +++ b/docs/images/congestion/receipt_flow_example_3_1.dot @@ -0,0 +1,67 @@ +digraph G { + rankdir=LR; + + // Invisible nodes for alignment + { + node [shape=point, width=0, height=0, label=""]; + invisible_sink1; + invisible_sink2; + } + + // Nodes + subgraph cluster_shard1 { + label = "Shard 1"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source1 [label="Source"]; + Shard1 [label="Shard 1"]; + Sink1 [label="Sink"]; + Buffer1 [label="Outgoing Buffer fills to max" shape=box color="red" penwidth=2]; + + + Sink1 -> invisible_sink1 [style=invis]; + Buffer1 -> Sink1 [style=invis]; + } + + subgraph cluster_shard2 { + label = "Shard 2"; + style=filled; + 
color=lightgrey; + node [shape=ellipse]; + + Source2 [label="Source"]; + Shard2 [label="Shard 2"]; + Sink2 [label="Sink"]; + Buffer2 [label="Outgoing Buffer fills to max" shape=box color="red" penwidth=2]; + + invisible_sink1 -> invisible_sink2 [style=invis]; + Source1 -> Source2 [style=invis]; + Sink2 -> invisible_sink2 [style=invis]; + Buffer2 -> Sink2 [style=invis]; + } + + + // Edges + Source1 -> Shard1 [label="blockd tx to shard 1+2" color="red" penwidth=3]; + Source2 -> Shard2 [label="blocked tx to shard 1+2" color="red" penwidth=3]; + + Shard1 -> Sink1 [label="0 Tgas / 1000 Tgas (deadlock)"]; + Shard2 -> Sink2 [label="0 Tgas / 1000 Tgas (deadlock)"]; + + Shard1 -> Buffer1 [label="backpressure" color="red" penwidth=2] + Buffer1 -> Shard2 [label="backpressure" color="red" penwidth=2]; + + Shard2 -> Buffer2 [label="backpressure" color="red" penwidth=2] + Buffer2 -> Shard1 [label="backpressure" color="red" penwidth=2]; + + + + + // Aligning subgraphs + {rank=same; Source1; Source2;} + {rank=same; Shard1; Shard2;} + {rank=same; Buffer1; Buffer2;} + {rank=same; Sink1; Sink2; invisible_sink1; invisible_sink2;} +} diff --git a/docs/images/congestion/receipt_flow_example_3_1.svg b/docs/images/congestion/receipt_flow_example_3_1.svg new file mode 100644 index 00000000000..e9316ee0f73 --- /dev/null +++ b/docs/images/congestion/receipt_flow_example_3_1.svg @@ -0,0 +1,133 @@ + + + + + + +G + + + +invisible_sink1 + + + + +invisible_sink2 + + + + + +Source1 + +Source + + + +Shard1 + +Shard 1 + + + +Source1->Shard1 + + +blockd tx to shard 1+2 + + + +Source2 + +Source + + + + +Sink1 + +Sink + + + +Shard1->Sink1 + + +0 Tgas / 1000 Tgas (deadlock) + + + +Buffer1 + +Outgoing Buffer fills to max + + + +Shard1->Buffer1 + + +backpressure + + + + + +Shard2 + +Shard 2 + + + +Buffer1->Shard2 + + +backpressure + + + +Source2->Shard2 + + +blocked tx to shard 1+2 + + + +Sink2 + +Sink + + + +Shard2->Sink2 + + +0 Tgas / 1000 Tgas (deadlock) + + + +Buffer2 + +Outgoing Buffer fills to max + 
+ + +Shard2->Buffer2 + + +backpressure + + + + +Buffer2->Shard1 + + +backpressure + + + + diff --git a/docs/images/congestion/receipt_flow_example_4.dot b/docs/images/congestion/receipt_flow_example_4.dot new file mode 100644 index 00000000000..7d8ea366064 --- /dev/null +++ b/docs/images/congestion/receipt_flow_example_4.dot @@ -0,0 +1,83 @@ +digraph G { + rankdir=LR; + + // Invisible nodes for alignment + { + node [shape=point, width=0, height=0, label=""]; + invisible_sink1; + invisible_sink2; + } + + // Nodes + subgraph cluster_shard1 { + label = "Shard 1"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source1 [label="Source"]; + Shard1 [label="Shard 1"]; + Sink1 [label="Sink"]; + Buffer1 [label="Outgoing Buffer fills to max" shape=box color="red" penwidth=2]; + Queue1 [label="Incoming Queue allowed to fill to 10 PGas unaffected by backpressure" shape=box color="blue" penwidth=2]; + + + Source1 -> Queue1 -> Buffer1 -> Shard1 -> Sink1 [style=invis]; + Queue1 -> Shard1 [style=invis]; + Source1 -> Shard1 [style=invis]; + + Sink1 -> invisible_sink1 [style=invis]; + Buffer1 -> Sink1 [style=invis]; + Source1 -> Queue1 [style=invis]; + Buffer1 -> Shard1 [style=invis]; + + Source1 -> Queue1 [label="blocked tx to shard 1+2" color="red" penwidth=3]; + Shard1 -> Sink1 [label="1000 Tgas / 1000 Tgas" color="green" penwidth=3]; + Queue1 -> Shard1 [label="guaranteed 1000 Tgas flow if congested" color="green" penwidth=3]; + Shard1 -> Buffer1 [label="backpressure" color="red" penwidth=2] + } + + subgraph cluster_shard2 { + label = "Shard 2"; + style=filled; + color=lightgrey; + node [shape=ellipse]; + + Source2 [label="Source"]; + Shard2 [label="Shard 2"]; + Sink2 [label="Sink"]; + Buffer2 [label="Outgoing Buffer fills to max" shape=box color="red" penwidth=2]; + Queue2 [label="Incoming Queue allowed to fill to 10 PGas unaffected by backpressure" shape=box color="blue" penwidth=2]; + + Queue2 -> Shard2 [style=invis]; + Source2 -> Queue2 -> Buffer2 -> Shard2 ->
 Sink2 [style=invis]; + Source2 -> Shard2 [style=invis]; + invisible_sink1 -> invisible_sink2 [style=invis]; + Source1 -> Source2 [style=invis]; + Sink2 -> invisible_sink2 [style=invis]; + Buffer2 -> Sink2 [style=invis]; + Source2 -> Queue2 [style=invis]; + Queue1 -> Queue2 [style=invis]; + Buffer1 -> Buffer2 [style=invis]; + Buffer2 -> Shard2 [style=invis]; + + Source2 -> Queue2 [label="blocked tx to shard 1+2" color="red" penwidth=3]; + Shard2 -> Sink2 [label="1000 Tgas / 1000 Tgas" color="green" penwidth=3]; + Queue2 -> Shard2 [label="guaranteed 1000 Tgas flow if congested" color="green" penwidth=3]; + Shard2 -> Buffer2 [label="backpressure" color="red" penwidth=2] + } + + // Edges between nodes + Queue2 -> Buffer1 [label="backpressure" color="red" penwidth=2 dir="back"]; + Queue1 -> Buffer2 [label="backpressure" color="red" penwidth=2 dir="back"]; + + + // Aligning subgraphs + // {rank=same; Source1; Source2; Queue1; Queue2;} + {rank=same; Source1; Source2;} + {rank=same; Queue1; Queue2;} + {rank=same; Shard1; Shard2;} + {rank=same; Buffer1; Buffer2;} + // {rank=same; Buffer1; Buffer2; Shard1; Shard2;} + {rank=same; Sink1; Sink2; invisible_sink1; invisible_sink2;} +} diff --git a/docs/images/congestion/receipt_flow_example_4.svg b/docs/images/congestion/receipt_flow_example_4.svg new file mode 100644 index 00000000000..0a3c72fe67b --- /dev/null +++ b/docs/images/congestion/receipt_flow_example_4.svg @@ -0,0 +1,177 @@ + + + + + + +G + + + +invisible_sink1 + + + + +invisible_sink2 + + + + + +Source1 + +Source + + + +Shard1 + +Shard 1 + + + + +Queue1 + +Incoming Queue allowed to fill to 10 PGas unaffected by backpressure + + + + + +Source1->Queue1 + + +blocked tx to shard 1+2 + + + +Source2 + +Source + + + + +Sink1 + +Sink + + + + +Shard1->Sink1 + + +1000 Tgas / 1000 Tgas + + + +Buffer1 + +Outgoing Buffer fills to max + + + +Shard1->Buffer1 + + +backpressure + + + + + + + +Buffer2 + +Outgoing Buffer fills to max + + + + + +Queue1->Shard1 + + +guaranteed 1000 Tgas
flow if congested + + + + +Queue1->Buffer2 + + +backpressure + + + +Queue2 + +Incoming Queue allowed to fill to 10 PGas unaffected by backpressure + + + + +Shard2 + +Shard 2 + + + + + + +Source2->Queue2 + + +blocked tx to shard 1+2 + + + +Sink2 + +Sink + + + + +Shard2->Sink2 + + +1000 Tgas / 1000 Tgas + + + +Shard2->Buffer2 + + +backpressure + + + + + + + +Queue2->Buffer1 + + +backpressure + + + + +Queue2->Shard2 + + +guaranteed 1000 Tgas flow if congested + + + + From c2117e9f8ee282efb79e5825e9d3acab515a06b1 Mon Sep 17 00:00:00 2001 From: Jakob Meier Date: Mon, 25 Mar 2024 15:04:12 +0100 Subject: [PATCH 3/5] fix typos and add section about byte vs gas limit --- docs/architecture/how/receipt-congestion.md | 16 ++++++++++++++-- .../congestion/receipt_flow_example_3_1.dot | 2 +- .../congestion/receipt_flow_example_3_1.svg | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/architecture/how/receipt-congestion.md b/docs/architecture/how/receipt-congestion.md index 36b9be409f4..9048878a814 100644 --- a/docs/architecture/how/receipt-congestion.md +++ b/docs/architecture/how/receipt-congestion.md @@ -119,7 +119,7 @@ indirectly requires bandwidth on the congested shard. Crucially, when accepting a transaction, we don't know ahead of time which shards will be affected by the full directed graph of receipts in a transaction. -We only know the first step For multi-jop transactions, there is no easy way out. +We only know the first step. For multi-jop transactions, there is no easy way out. But it is worth mentioning, that in practice the single-hop function call is the most common case. And this case can be handled nicely by rejecting incoming @@ -127,7 +127,7 @@ transactions to congested shards. ## Idea 3: Apply backpressure to stop all flows to a congested shard -On top op stopping transactions to congested shards, we can also stop receipts if they have a congested shard as the receiver. 
+On top of stopping transactions to congested shards, we can also stop receipts if they have a congested shard as the receiver. We simply put them in a buffer of the sending shard and keep them there until the congested shard has space again for the receipts. @@ -160,6 +160,18 @@ transaction, we still have 1000 Tgas of useful work we can contribute to the total flow. Thus under the assumption that at least 10% of gas is being burnt, we have 100% utilization. +A limit in bytes would be better to argue how much memory we need exactly. But +in some sense, the wo are equivalent, as producing large receipts should cost a +linear amount of gas. What exactly the conversion rate is, is rather complicated +and warrants its own investigation with potential protocol changes to lower the +ratio in the most extreme cases. And this is important regardless of how +congestion is handled, given that network bandwidth is becoming more and more +important as we add more shards. Issue +[#8214](https://github.com/near/nearcore/issues/8214) tracks our effort on +estimating what that cost should be and +[#9378](https://github.com/near/nearcore/issues/9378) tracks our best progress +on calculating what it is today. + Of course, we can increase the queue to have even better utility guarantees. But it comes at the cost of longer delays for every transaction or receipt that goes through a congested shard. 
diff --git a/docs/images/congestion/receipt_flow_example_3_1.dot b/docs/images/congestion/receipt_flow_example_3_1.dot index e4453d4cc73..a0bd2049dc6 100644 --- a/docs/images/congestion/receipt_flow_example_3_1.dot +++ b/docs/images/congestion/receipt_flow_example_3_1.dot @@ -44,7 +44,7 @@ digraph G { // Edges - Source1 -> Shard1 [label="blockd tx to shard 1+2" color="red" penwidth=3]; + Source1 -> Shard1 [label="blocked tx to shard 1+2" color="red" penwidth=3]; Source2 -> Shard2 [label="blocked tx to shard 1+2" color="red" penwidth=3]; Shard1 -> Sink1 [label="0 Tgas / 1000 Tgas (deadlock)"]; diff --git a/docs/images/congestion/receipt_flow_example_3_1.svg b/docs/images/congestion/receipt_flow_example_3_1.svg index e9316ee0f73..37a127e2187 100644 --- a/docs/images/congestion/receipt_flow_example_3_1.svg +++ b/docs/images/congestion/receipt_flow_example_3_1.svg @@ -37,7 +37,7 @@ Source1->Shard1 -blockd tx to shard 1+2 +blocked tx to shard 1+2 From 759243965beeac161b384ac2db33b124030dbaea Mon Sep 17 00:00:00 2001 From: Jakob Meier Date: Thu, 28 Mar 2024 15:47:14 +0100 Subject: [PATCH 4/5] Apply suggestions from code review Co-authored-by: wacban --- docs/architecture/how/receipt-congestion.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/architecture/how/receipt-congestion.md b/docs/architecture/how/receipt-congestion.md index 9048878a814..e02d59dd6fe 100644 --- a/docs/architecture/how/receipt-congestion.md +++ b/docs/architecture/how/receipt-congestion.md @@ -55,7 +55,7 @@ cross-shard congestion design proposed in One approach to solve congestion would be to never allow more work into the system than we can execute. -But this is not ideal Just consider this example where everybody tries to access +But this is not ideal. Just consider this example where everybody tries to access a contract on the same shard. 
![graph](../../images/congestion/receipt_flow_example_1.svg) @@ -119,7 +119,7 @@ indirectly requires bandwidth on the congested shard. Crucially, when accepting a transaction, we don't know ahead of time which shards will be affected by the full directed graph of receipts in a transaction. -We only know the first step. For multi-jop transactions, there is no easy way out. +We only know the first step. For multi-hop transactions, there is no easy way out. But it is worth mentioning, that in practice the single-hop function call is the most common case. And this case can be handled nicely by rejecting incoming @@ -146,7 +146,7 @@ send receipts to a shard that has not enough work in the delayed receipts queue already. Basically, the backpressure limits from idea 3 are only applied to incoming -receipts but not for the total size. This guarantees that in congested the +receipts but not for the total size. This guarantees that in the congested scenario that previously caused a deadlock, we always have something in the incoming queue to work on, otherwise there wouldn't be backpressure at all. @@ -161,7 +161,7 @@ total flow. Thus under the assumption that at least 10% of gas is being burnt, we have 100% utilization. A limit in bytes would be better to argue how much memory we need exactly. But -in some sense, the wo are equivalent, as producing large receipts should cost a +in some sense, the two are equivalent, as producing large receipts should cost a linear amount of gas. What exactly the conversion rate is, is rather complicated and warrants its own investigation with potential protocol changes to lower the ratio in the most extreme cases. 
And this is important regardless of how From d642476dbcd864142b85d9e15170982b1c7b71f2 Mon Sep 17 00:00:00 2001 From: Jakob Meier Date: Thu, 28 Mar 2024 16:05:00 +0100 Subject: [PATCH 5/5] fix errors found in code review --- docs/architecture/how/receipt-congestion.md | 2 +- .../congestion/receipt_flow_example_1.dot | 1 + .../congestion/receipt_flow_example_1.svg | 79 ++++++++++--------- .../congestion/receipt_flow_example_1_1.dot | 1 + .../congestion/receipt_flow_example_1_1.svg | 79 ++++++++++--------- .../congestion/receipt_flow_example_2.dot | 2 +- .../congestion/receipt_flow_example_2.svg | 2 +- 7 files changed, 91 insertions(+), 75 deletions(-) diff --git a/docs/architecture/how/receipt-congestion.md b/docs/architecture/how/receipt-congestion.md index e02d59dd6fe..fe9d67f21ff 100644 --- a/docs/architecture/how/receipt-congestion.md +++ b/docs/architecture/how/receipt-congestion.md @@ -75,7 +75,7 @@ really, as this limit is hardly practical. It means we limit global throughput to that of a single shard. Then why would we do sharding in the first place? The sad thing is, there is no way around it in the most general case. A -congestion control strategy that does apply this limit to this workload will +congestion control strategy that does not apply this limit to this workload will always have infinitely sized queues. Of course, we won't give up. 
We are not limited to a constant capacity limit, we diff --git a/docs/images/congestion/receipt_flow_example_1.dot b/docs/images/congestion/receipt_flow_example_1.dot index 4f5b5e83ba7..a54eb7c7e98 100644 --- a/docs/images/congestion/receipt_flow_example_1.dot +++ b/docs/images/congestion/receipt_flow_example_1.dot @@ -63,6 +63,7 @@ digraph G { Shard1 -> Shard3 [label="999 Tgas"]; Shard2 -> Shard3 [label="999 Tgas"]; + Shard3 -> Shard3 [label="999 Tgas", dir="back"]; // Aligning subgraphs {rank=same; Source1; Source2; Source3;} diff --git a/docs/images/congestion/receipt_flow_example_1.svg b/docs/images/congestion/receipt_flow_example_1.svg index 0b64e6ec9f8..cf90ff9fe2e 100644 --- a/docs/images/congestion/receipt_flow_example_1.svg +++ b/docs/images/congestion/receipt_flow_example_1.svg @@ -29,21 +29,21 @@ Source1 - -Source + +Source Shard1 - -Shard 1 + +Shard 1 Source1->Shard1 - - -1000 Tgas + + +1000 Tgas @@ -54,42 +54,42 @@ Shard1->Sink1 - - -1 Tgas / 1000 Tgas + + +1 Tgas / 1000 Tgas Shard3 - -Shard 3 + +Shard 3 Shard1->Shard3 - - -999 Tgas + + +999 Tgas Source2 - -Source + +Source Shard2 - -Shard 2 + +Shard 2 Source2->Shard2 - - -1000 Tgas + + +1000 Tgas @@ -100,30 +100,37 @@ Shard2->Sink2 - - -1 Tgas / 1000 Tgas + + +1 Tgas / 1000 Tgas Shard2->Shard3 - - -999 Tgas + + +999 Tgas Source3 - -Source + +Source Source3->Shard3 - - -1000 Tgas + + +1000 Tgas + + + +Shard3->Shard3 + + +999 Tgas @@ -134,9 +141,9 @@ Shard3->Sink3 - - -2998 Tgas / 1000 Tgas + + +2998 Tgas / 1000 Tgas diff --git a/docs/images/congestion/receipt_flow_example_1_1.dot b/docs/images/congestion/receipt_flow_example_1_1.dot index 855dedcaafc..82c4dee9c49 100644 --- a/docs/images/congestion/receipt_flow_example_1_1.dot +++ b/docs/images/congestion/receipt_flow_example_1_1.dot @@ -63,6 +63,7 @@ digraph G { Shard1 -> Shard3 [label="332 Tgas"]; Shard2 -> Shard3 [label="332 Tgas"]; + Shard3 -> Shard3 [label="332 Tgas", dir="back"]; // Aligning subgraphs {rank=same; Source1; Source2; Source3;} diff 
--git a/docs/images/congestion/receipt_flow_example_1_1.svg b/docs/images/congestion/receipt_flow_example_1_1.svg index aba610fcbc1..560ef7506d8 100644 --- a/docs/images/congestion/receipt_flow_example_1_1.svg +++ b/docs/images/congestion/receipt_flow_example_1_1.svg @@ -29,21 +29,21 @@ Source1 - -Source + +Source Shard1 - -Shard 1 + +Shard 1 Source1->Shard1 - - -333 Tgas + + +333 Tgas @@ -54,42 +54,42 @@ Shard1->Sink1 - - -1 Tgas / 1000 Tgas + + +1 Tgas / 1000 Tgas Shard3 - -Shard 3 + +Shard 3 Shard1->Shard3 - - -332 Tgas + + +332 Tgas Source2 - -Source + +Source Shard2 - -Shard 2 + +Shard 2 Source2->Shard2 - - -333 Tgas + + +333 Tgas @@ -100,30 +100,37 @@ Shard2->Sink2 - - -1 Tgas / 1000 Tgas + + +1 Tgas / 1000 Tgas Shard2->Shard3 - - -332 Tgas + + +332 Tgas Source3 - -Source + +Source Source3->Shard3 - - -333 Tgas + + +333 Tgas + + + +Shard3->Shard3 + + +332 Tgas @@ -134,9 +141,9 @@ Shard3->Sink3 - - -997 Tgas / 1000 Tgas + + +997 Tgas / 1000 Tgas diff --git a/docs/images/congestion/receipt_flow_example_2.dot b/docs/images/congestion/receipt_flow_example_2.dot index 81280f37ab4..0aa01351268 100644 --- a/docs/images/congestion/receipt_flow_example_2.dot +++ b/docs/images/congestion/receipt_flow_example_2.dot @@ -96,7 +96,7 @@ digraph G { Shard1 -> Sink1 [label="10 Tgas / 1000 Tgas"]; Shard2 -> Sink2 [label="10 Tgas / 1000 Tgas"]; - Shard3 -> Sink3 [label="1980 Tgas / 1000 Tgas" color="red" penwidth=3]; + Shard3 -> Sink3 [label="1960 Tgas / 1000 Tgas" color="red" penwidth=3]; Shard4 -> Sink4 [label="10 Tgas / 1000 Tgas"]; Shard5 -> Sink5 [label="10 Tgas / 1000 Tgas"]; diff --git a/docs/images/congestion/receipt_flow_example_2.svg b/docs/images/congestion/receipt_flow_example_2.svg index d0cfd6553ab..c4537fd551c 100644 --- a/docs/images/congestion/receipt_flow_example_2.svg +++ b/docs/images/congestion/receipt_flow_example_2.svg @@ -157,7 +157,7 @@ Shard3->Sink3 -1980 Tgas / 1000 Tgas +1960 Tgas / 1000 Tgas