From 08425978767a162718c28b9fe5087a5bd7696007 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 14:18:07 -0700 Subject: [PATCH 01/50] initial commit (data synthesizer) --- ATTRIBUTIONS.md | 38 +++ benchmark/__init__.py | 0 benchmark/data_synth/README.md | 35 ++ benchmark/data_synth/__init__.py | 0 benchmark/data_synth/graph_utils.py | 109 ++++++ benchmark/data_synth/prefix_analyzer.py | 336 ++++++++++++++++++ benchmark/data_synth/sampler.py | 38 +++ benchmark/data_synth/synthesizer.py | 433 ++++++++++++++++++++++++ benchmark/tests/__init__.py | 0 benchmark/tests/test_sampler.py | 31 ++ benchmark/tests/test_syntheiszer.py | 66 ++++ benchmark/utils/__init__.py | 0 benchmark/utils/logging.py | 38 +++ container/deps/requirements.txt | 1 + 14 files changed, 1125 insertions(+) create mode 100644 benchmark/__init__.py create mode 100644 benchmark/data_synth/README.md create mode 100644 benchmark/data_synth/__init__.py create mode 100644 benchmark/data_synth/graph_utils.py create mode 100644 benchmark/data_synth/prefix_analyzer.py create mode 100644 benchmark/data_synth/sampler.py create mode 100644 benchmark/data_synth/synthesizer.py create mode 100644 benchmark/tests/__init__.py create mode 100644 benchmark/tests/test_sampler.py create mode 100644 benchmark/tests/test_syntheiszer.py create mode 100644 benchmark/utils/__init__.py create mode 100644 benchmark/utils/logging.py diff --git a/ATTRIBUTIONS.md b/ATTRIBUTIONS.md index ffc99b5a31..5434dd6a21 100644 --- a/ATTRIBUTIONS.md +++ b/ATTRIBUTIONS.md @@ -228,6 +228,44 @@ limitations under the License. ``` +## networkx - [3-Clause BSD License](https://github.com/networkx/networkx/blob/main/LICENSE.txt) + + ``` +Copyright (C) 2004-2024, NetworkX Developers +Aric Hagberg +Dan Schult +Pieter Swart +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the NetworkX Developers nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ ```
+
 ## ucx-py-cu12 - [BSD 3-Clause "New" or "Revised" License](https://github.com/rapidsai/ucx-py/blob/main/LICENSE)
 
 ```
diff --git a/benchmark/__init__.py b/benchmark/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/benchmark/data_synth/README.md b/benchmark/data_synth/README.md
new file mode 100644
index 0000000000..375331a98e
--- /dev/null
+++ b/benchmark/data_synth/README.md
@@ -0,0 +1,35 @@
+This directory is currently used to generate synthetic data based on the mooncake dataset, but it should be easily extendible to any request dataset with (prefix) hash ids.
+
+## Quickstart
+
+For instance, you can run:
+```
+python -m benchmark.data_synth.synthesizer \
+--input-file mooncake_trace.jsonl \
+--num-requests 500 \
+--depth-multiplier 4 \
+--width-multiplier 4 \
+--prompt-len-multiplier 0.1
+```
+where `num-requests` sets the total number of synthetic requests generated, `speedup-ratio` (not shown in the example) tunes the rate at which the requests are sent, `depth-multiplier` tunes the lengths of the request prefixes (a higher multiplier yields longer ISLs), and `width-multiplier` controls the branching factor of the synthetic requests (a higher multiplier generates more diverse request patterns).
+
+
+The synthesizer is designed to work for jsonl files in the "mooncake" trace file format, meaning that the inputs are increasing integer block hashes. For now, each new block hash must be the next consecutive integer; otherwise the synthesizer will not work.
+
+## How it works
+
+The generation algorithm, simplified, is as follows:
+
+- Store the hash ids in a directed tree structure (prefix tree).
+- Each directed edge `weight` indicates how many times the edge is traversed, which is needed to compute transition probabilities.
+- Contract unary paths (chains) in the tree so that it is in radix-tree form, meaning every node that is an only child is contracted into its parent. As a consequence, each node needs to store an attribute `length` to indicate the compressed length (1 if no compression). The depth multiplier scales this compressed length (rounded to the nearest integer), effectively increasing the length of each radix node.
+- Identify every leaf node that is visited only once, and prune it from the tree, as it is highly likely not part of the core radix tree. In other words, we do not need to store nodes that are part of the actual user prompts.
+- At this stage, each node has (possibly zero) transition probabilities to a child prefix node, to a "user prompt" node, and to a "termination" node. Use these probabilities to sample a path in the core radix tree, then append the path with new hash ids corresponding to a user prompt whose length is sampled from the dataset. The width multiplier effectively duplicates the entire radix tree the specified number of times, each with a new set of hash ids, creating more diverse request patterns. A simplified sketch of this procedure is shown after this README.
+
+## Testing
+
+To test for "correctness", or faithfulness to the original mooncake statistics, one can run
+```
+python -m benchmark.data_synth.synthesizer --num-requests 500000
+```
+and compare the synthetic ISL statistics (mean, median, std) to the original ISL statistics. I find this to be the most "robust" end-to-end test.
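To make the "How it works" steps above concrete, the following is a minimal, self-contained sketch of the same idea: build a weighted prefix tree from mooncake-style `hash_ids` lists, then walk it by sampling each transition from a CDF over outgoing edge weights, including a termination branch. It is an illustrative toy, not the actual `synthesizer.py` implementation: the toy trace, the `sample_path` helper, and the way the stop weight is derived are assumptions made here, and the chain contraction, leaf pruning, and multiplier logic of the real module are omitted.

```python
# Illustrative sketch only: a stripped-down version of the prefix-tree
# construction and CDF-based path sampling described above. Names such as
# `toy_trace` and `sample_path` are hypothetical and not part of the module.
import networkx as nx
import numpy as np

toy_trace = [[0, 1, 2, 3], [0, 1, 2], [0, 1, 4]]  # hash_ids of three requests

# 1. Build a directed prefix tree rooted at -1; edge weights count traversals.
G = nx.DiGraph()
G.add_node(-1, visited=len(toy_trace))
for hash_ids in toy_trace:
    u = -1
    for v in hash_ids:
        if v in G:
            G.nodes[v]["visited"] += 1
        else:
            G.add_node(v, visited=1)
        if G.has_edge(u, v):
            G[u][v]["weight"] += 1
        else:
            G.add_edge(u, v, weight=1)
        u = v

# 2. At each node, turn the outgoing edge weights (plus a "terminate here"
#    weight) into a CDF so the next hop is chosen with one uniform draw.
rng = np.random.default_rng(0)


def sample_path(graph):
    node, path = -1, []
    while True:
        children = list(graph.successors(node))
        # stop weight = visits that did not continue to any child
        stop = graph.nodes[node]["visited"] - sum(
            graph[node][c]["weight"] for c in children
        )
        weights = [graph[node][c]["weight"] for c in children] + [max(stop, 0)]
        cdf = np.cumsum(weights) / np.sum(weights)
        choice = int(np.searchsorted(cdf, rng.random()))
        if choice == len(children):  # sampled the termination branch
            return path
        node = children[choice]
        path.append(node)


print(sample_path(G))  # e.g. [0, 1, 2] -- a prefix shared across requests
```

Precomputing the per-node CDFs once, as `_precompute_transition_cdfs` does in this patch, avoids redoing the normalization for every synthesized request.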
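The end-to-end check described under "Testing" can also be scripted directly. Below is a hedged sketch that compares ISL statistics between the original and a synthesized trace; the two file paths are placeholders for whatever trace files you actually have, and only the `input_length` field present in both formats is assumed.

```python
# Hypothetical comparison script for the "Testing" step above; the two file
# paths are placeholders and only the shared "input_length" field is read.
import json

import numpy as np


def isl_stats(path: str) -> dict:
    """Return mean/median/std of input sequence lengths in a jsonl trace."""
    with open(path) as f:
        lengths = np.array([json.loads(line)["input_length"] for line in f])
    return {"mean": lengths.mean(), "median": np.median(lengths), "std": lengths.std()}


for name, path in [
    ("original", "mooncake_trace.jsonl"),        # placeholder path
    ("synthetic", "mooncake_trace_synth.jsonl"),  # placeholder path
]:
    stats = isl_stats(path)
    print(f"{name:>9}: " + ", ".join(f"{key}={value:.1f}" for key, value in stats.items()))
```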
diff --git a/benchmark/data_synth/__init__.py b/benchmark/data_synth/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmark/data_synth/graph_utils.py b/benchmark/data_synth/graph_utils.py new file mode 100644 index 0000000000..4437a12e92 --- /dev/null +++ b/benchmark/data_synth/graph_utils.py @@ -0,0 +1,109 @@ +import numpy as np +import networkx as nx + +from benchmark.data_synth.sampler import get_cdf + + +def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: + """Make the graph radix-like (meaning all unary paths are contracted). + In addition, keep track of the contracted lengths. + + Args: + G (networkx.DiGraph): A directed graph representing a prefix tree structure. + + Returns: + networkx.DiGraph: The modified graph with unary paths contracted. + """ + for visited in sorted(np.unique([G.nodes[node]["visited"] for node in G.nodes()])): + sub_nodes = [node for node in G.nodes() if G.nodes[node]["visited"] == visited] + subgraph = G.subgraph(sub_nodes) + if len(subgraph) == 1: + continue + + chain_nodes = [ + node + for node in subgraph.nodes() + if G.in_degree(node) == 1 and G.out_degree(node) == 1 + ] + if not chain_nodes: + continue + chain_nodes = sorted(chain_nodes) + + nodes_rm = [] + for node in chain_nodes: + node_pred = list(G.predecessors(node))[0] + # find the parent node source + if G.nodes[node_pred]["visited"] == visited: + continue + weight = G[node_pred][node]["weight"] + + end_node = node + chain_len = 1 + succ = list(G.successors(end_node)) + + # find the end of the chain + while succ and G.nodes[succ[0]]["visited"] == visited: + nodes_rm.append(end_node) + end_node = succ[0] + chain_len += 1 + succ = list(G.successors(end_node)) + + G.add_edge(node_pred, end_node, weight=weight) + G.nodes[end_node]["length"] = chain_len + + G.remove_nodes_from(nodes_rm) + + for node in G.nodes(): + if "length" not in G.nodes[node]: + G.nodes[node]["length"] = 1 + + return G + + +def _remove_leaves(G: nx.DiGraph) -> tuple[nx.DiGraph, list[int]]: + leaves = { + node: G.nodes[node]["length"] + for node in G.nodes() + if G.nodes[node]["visited"] == 1 + } + leaves_id = list(leaves.keys()) + leaves_len = list(leaves.values()) + G.remove_nodes_from(leaves_id) + return G, leaves_len + + +def _precompute_transition_cdfs(G: nx.DiGraph) -> nx.DiGraph: + for node in G.nodes(): + out_edges = list(G.out_edges(node)) + weights = [G[edge[0]][edge[1]]["weight"] for edge in out_edges] + [ + G.nodes[node]["to_leaf"], + G.nodes[node]["end"], + ] + G.nodes[node]["out_cdf"] = get_cdf(weights) + G.nodes[node]["out_nodes"] = [edge[1] for edge in out_edges] + [-2, -3] + + return G + + +def _validate_graph(G: nx.DiGraph) -> bool: + for node in G.nodes(): + # Skip nodes without parents or children + if G.in_degree(node) == 0 or G.out_degree(node) == 0: + continue + + # Get incoming edge weight (should only be one parent) + parent = list(G.predecessors(node))[0] + in_weight = G[parent][node]["weight"] + + # Sum outgoing edge weights + out_weights = [G[node][child]["weight"] for child in G.successors(node)] + out_weights += [G.nodes[node]["to_leaf"], G.nodes[node]["end"]] + + # Compare weights (using np.isclose for float comparison) + if not in_weight == sum(out_weights): + raise ValueError( + f"Weight mismatch at node {node}: " + f"incoming weight {in_weight} != sum of outgoing weights {sum(out_weights)}" + ) + + return True diff --git a/benchmark/data_synth/prefix_analyzer.py b/benchmark/data_synth/prefix_analyzer.py new file mode 100644 index 0000000000..cfa44875f8 --- /dev/null +++ 
b/benchmark/data_synth/prefix_analyzer.py @@ -0,0 +1,336 @@ +import json +from collections import Counter + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from tabulate import tabulate +from tqdm import tqdm + + +class PrefixAnalyzer: + """ + A class for analyzing dataset characteristics related to prefixes, hash IDs, and cache hit rates. + """ + + def __init__(self, dataset_path, block_size=1): + """ + Initialize the analyzer with dataset path and block size. + + Args: + dataset_path: Path to the JSONL dataset file + block_size: Size of each block for prefix calculation + """ + self.dataset_path = dataset_path + self.block_size = block_size + self.dataset = self._load_dataset() + self.hash_counter = self._build_hash_counter() + self.repeated_hash_ids = self._find_repeated_hash_ids() + + def _load_dataset(self): + print(f"Loading dataset from {self.dataset_path}...") + dataset = [] + with open(self.dataset_path, "r") as f: + for line in f: + dataset.append(json.loads(line)) + print(f"Dataset loaded: {len(dataset)} examples") + return dataset + + def _build_hash_counter(self): + all_hash_ids = [] + for item in tqdm(self.dataset, desc="Processing hash IDs"): + all_hash_ids.extend(item["hash_ids"]) + counter = Counter(all_hash_ids) + print(f"Hash counter built: {len(counter)} unique hash IDs") + return counter + + def _find_repeated_hash_ids(self): + return {hash_id for hash_id, count in self.hash_counter.items() if count > 1} + + def analyze_dataset_lengths(self): + """ + Analyze dataset to extract various length metrics and print statistics. + + Returns: + Tuple of lists: (input_lengths, prefix_lengths, user_prompt_lengths, output_lengths) + """ + # Extract input and output lengths directly from fields + input_lengths = [item["input_length"] for item in self.dataset] + output_lengths = [item["output_length"] for item in self.dataset] + + # Calculate prefix length and user prompt length for each row + prefix_lengths = [] + user_prompt_lengths = [] + + for i, item in tqdm( + enumerate(self.dataset), + total=len(self.dataset), + desc="Analyzing dataset lengths", + ): + input_len = item["input_length"] + hash_ids = item["hash_ids"] + assert len(hash_ids) * self.block_size >= input_len + + # Special case: if all hash IDs in the row are repeated elsewhere + if all(hash_id in self.repeated_hash_ids for hash_id in hash_ids): + prefix_len = input_len # Set prefix length to input length + user_prompt_len = 0 # Set user prompt length to 0 + else: + # Count how many hash IDs in this row are repeated elsewhere in the dataset + repeated_count = sum( + 1 for hash_id in hash_ids if hash_id in self.repeated_hash_ids + ) + prefix_len = repeated_count * self.block_size + user_prompt_len = input_len - prefix_len + + prefix_lengths.append(prefix_len) + user_prompt_lengths.append(user_prompt_len) + + # Check if prefix length is greater than input length + if prefix_len > input_len: + print(f"WARNING: Line {i}: {json.dumps(item)}") + + # Print statistics table + metrics = { + "Input Length": input_lengths, + "Prefix Length": prefix_lengths, + "User Prompt Length": user_prompt_lengths, + "Output Length": output_lengths, + } + + print_statistics_table(metrics) + + return input_lengths, prefix_lengths, user_prompt_lengths, output_lengths + + def visited_radix_lens(self, ax=None, legend=None): + """ + Analyze radix lengths based on hash IDs with the same repetition count as the first hash. 
+ + Args: + ax: Matplotlib axis handle for plotting (if None, creates a new figure) + legend: Legend label for this dataset in the plot + + Returns: + The matplotlib axis handle + """ + # For each row, calculate the radix length based on the first hash's repetition count + radix_lengths = [] + + for item in tqdm(self.dataset): + # Skip if there are no hash_ids + if len(item["hash_ids"]) == 0: + continue + + # Get the repetition count of the first hash ID + first_hash = item["hash_ids"][0] + first_hash_repetition_count = self.hash_counter[first_hash] + if first_hash_repetition_count <= 1: + continue + + # Count how many hash IDs in this row have the same repetition count + matching_hash_ids = sum( + 1 + for hash_id in item["hash_ids"] + if self.hash_counter[hash_id] == first_hash_repetition_count + ) + + # Calculate radix length + radix_length = matching_hash_ids * self.block_size + radix_lengths.append((first_hash, radix_length)) + + # Count occurrences of each (first_hash, radix_length) tuple + radix_lens_counter = Counter(radix_lengths) + + # Create a new figure if no axis is provided + if ax is None: + fig, ax = plt.subplots(figsize=(10, 6)) + + # Extract x and y values for plotting + x_values = [tup[1] for tup in radix_lens_counter.keys()] + y_values = list(radix_lens_counter.values()) + + # Plot with legend if provided + scatter = ax.scatter( + x_values, y_values, alpha=0.7, label=legend + " (trunk)" if legend else None + ) + + # Now analyze based on the last hash with nonzero repeat count + radix_lengths_last = [] + for item in tqdm(self.dataset): + # Skip if there are no hash_ids + if len(item["hash_ids"]) == 0: + continue + + # Find the last hash with nonzero repeat count + last_hash = None + for hash_id in reversed(item["hash_ids"]): + if self.hash_counter[hash_id] > 1: + last_hash = hash_id + break + + if last_hash is None: + continue + + last_hash_repetition_count = self.hash_counter[last_hash] + + # Count how many hash IDs in this row have repeat count greater than the last hash's + matching_hash_ids = sum( + 1 + for hash_id in item["hash_ids"] + if self.hash_counter[hash_id] > last_hash_repetition_count + ) + + # Calculate radix length + radix_length = matching_hash_ids * self.block_size + radix_lengths_last.append((last_hash, radix_length)) + + # Count occurrences of each (last_hash, radix_length) tuple + radix_lens_last_counter = Counter(radix_lengths_last) + + # Extract x and y values for plotting + x_values_last = [tup[1] for tup in radix_lens_last_counter.keys()] + y_values_last = list(radix_lens_last_counter.values()) + + # Plot with legend if provided + scatter_last = ax.scatter( + x_values_last, + y_values_last, + alpha=0.7, + marker="x", + label=legend + " (branch)" if legend else None, + ) + + # Add legend if any labels exist + if legend is not None: + ax.legend() + + ax.set_xlabel("Radix Length") + ax.set_ylabel("Visited") + ax.set_xscale("log") + # ax.set_yscale('log') + ax.grid(True, linestyle="--", alpha=0.7) + + # Return the axis for further modifications + return ax + + def analyze_cache_hit_rates(self): + """ + Analyze theoretical cache hit rates based on hash ID repetition. 
+ + Returns: + List of cache hit rates for each row in the dataset + """ + # Set to track all hash IDs we've seen + seen_hash_ids = set() + + # Store cache hit rates for each row + cache_hit_rates = [] + + for item in tqdm(self.dataset, desc="Calculating cache hit rates"): + hash_ids = item["hash_ids"] + + # Skip if there are no hash IDs + if len(hash_ids) == 0: + continue + + # Find the first index where the hash ID hasn't been seen before + first_unseen_idx = len(hash_ids) # Default if all are seen + for idx, hash_id in enumerate(hash_ids): + if hash_id not in seen_hash_ids: + first_unseen_idx = idx + break + + # Calculate cache hit rate + cache_hit_rate = first_unseen_idx / len(hash_ids) + cache_hit_rates.append(cache_hit_rate) + + # Add all hash IDs from this row to the seen set + seen_hash_ids.update(hash_ids) + + # Create histogram of cache hit rates + plt.figure(figsize=(10, 6)) + plt.hist( + cache_hit_rates, bins=50, alpha=0.7, color="skyblue", edgecolor="black" + ) + plt.xlabel("Cache Hit Rate") + plt.ylabel("Frequency") + plt.title("Theoretical Cache Hit Rates") + + # Add statistics text to the plot directly + stats_text = ( + f"Mean: {np.mean(cache_hit_rates):.4f}\n" + f"Median: {np.median(cache_hit_rates):.4f}\n" + f"Min: {np.min(cache_hit_rates):.4f}\n" + f"Max: {np.max(cache_hit_rates):.4f}\n" + f"Std Dev: {np.std(cache_hit_rates):.4f}" + ) + + # Position the text in the upper right corner with some padding + plt.text( + 0.95, + 0.95, + stats_text, + transform=plt.gca().transAxes, + verticalalignment="top", + horizontalalignment="right", + bbox=dict(boxstyle="round", facecolor="white", alpha=0.8), + ) + + plt.grid(True, linestyle="--", alpha=0.7) + plt.savefig("theoretical_hit_rates.png", dpi=300, bbox_inches="tight") + plt.close() + + return cache_hit_rates + + +def print_statistics_table(metrics): + """ + Print a formatted table of statistics for the given metrics. + + Args: + metrics: Dictionary mapping metric names to lists of values + """ + stats_data = [] + for metric_name, values in metrics.items(): + stats_data.append( + { + "Metric": metric_name, + "Mean": np.mean(values), + "Std Dev": np.std(values), + "Min": np.min(values), + "P25": np.percentile(values, 25), + "Median": np.median(values), + "P75": np.percentile(values, 75), + "Max": np.max(values), + } + ) + + # Create DataFrame from the collected statistics + stats_df = pd.DataFrame(stats_data) + stats_df = stats_df.set_index("Metric") + stats_df = stats_df.round(2) + + # Print the table using tabulate with a pretty format + print(tabulate(stats_df, headers="keys", tablefmt="pretty")) + + +if __name__ == "__main__": + # Main routine that uses the specified dataset with block size of 16 + block_size = 16 + dataset_path = f"../datasets/avian_r100000_bs{block_size}_synth.jsonl" + # dataset_path = "/home/rupei/nova-benchmarking/datasets/gen_prompts_32k_2_languages_16.jsonl" + + print(f"Analyzing dataset: {dataset_path}") + print(f"Using block size: {block_size}") + print() + + # Create analyzer instance + analyzer = PrefixAnalyzer(dataset_path, block_size=block_size) + + # Run analyses + input_lens, prefix_lens, user_prompt_lens, output_lens = ( + analyzer.analyze_dataset_lengths() + ) + analyzer.analyze_cache_hit_rates() + + print(f"\nAnalysis complete. 
Processed {len(input_lens)} examples.") diff --git a/benchmark/data_synth/sampler.py b/benchmark/data_synth/sampler.py new file mode 100644 index 0000000000..f5f9457b44 --- /dev/null +++ b/benchmark/data_synth/sampler.py @@ -0,0 +1,38 @@ +from collections import Counter +from typing import List, Tuple, Dict, Any, Union, Optional + +import numpy as np +from numpy.random import Generator + + +def get_cdf(weights: List[float]) -> np.ndarray: + cumsum = np.cumsum(weights) + return cumsum / cumsum[-1] + + +def data_to_cdf(data: np.ndarray) -> Tuple[List[Any], np.ndarray]: + sorted_counter: Dict[Any, int] = dict(sorted(Counter(data).items())) + data_unique: List[Any] = list(sorted_counter.keys()) + counter_cdf: np.ndarray = get_cdf(list(sorted_counter.values())) + return data_unique, counter_cdf + + +def sample_from_cdf( + data: List[Any], cdf: np.ndarray, rng: Optional[Generator] = None +) -> Any: + # NOTE: assumes (but does not verify) that the CDF is valid + # CDF stands for cumulative distribution function + assert len(data) == len(cdf) + if rng is not None: + return data[np.searchsorted(cdf, rng.random())] + else: + return data[np.searchsorted(cdf, np.random.rand())] + + +class EmpiricalSampler: + def __init__(self, data: Union[List[Any], np.ndarray]) -> None: + self.rng = np.random.default_rng(0) + self.data, self.cdf = data_to_cdf(np.array(data)) + + def sample(self) -> Any: + return sample_from_cdf(self.data, self.cdf, self.rng) diff --git a/benchmark/data_synth/synthesizer.py b/benchmark/data_synth/synthesizer.py new file mode 100644 index 0000000000..92f2202655 --- /dev/null +++ b/benchmark/data_synth/synthesizer.py @@ -0,0 +1,433 @@ +import json +from collections import Counter + +import networkx as nx +import numpy as np +import pandas as pd + +from benchmark.utils.logging import calculate_and_print_statistics +from benchmark.data_synth.graph_utils import ( + _merge_chains, + _precompute_transition_cdfs, + _remove_leaves, +) +from benchmark.data_synth.sampler import EmpiricalSampler, sample_from_cdf + + +class Synthesizer: + def __init__( + self, + dataset_file: str, + block_size: int = 512, + num_copies: int = 1, + speedup_ratio: float = 1.0, + context_len_multiplier: float = 1.0, + prompt_len_multiplier: float = 1.0, + ): + """Load the mooncake dataset and extract core statistics like + radix-tree structure, ISL, OSL, and request timings. + Generate synthetic datasets based on these statistics, with tunable knobs, + e.g. to increase request rate or the ISL. + + A request is broken into two parts: a context and a prompt. A context is + any block that is (can possibly be) visited more than once, while a prompt + is considered to be unique and only visited once (user prompt). + + Args: + dataset_file (str): The mooncake trace file in jsonl format. + block_size (int, optional): The block size for prefilling and decoding. + Defaults to 512. + speedup_ratio (int, optional): For speeding up the request intervals. + Defaults to 1. + context_len_multiplier (float, optional): For every node in the core radix-tree, + increase the substring length by this multiplier, and rounded to the nearest + multiple of the block size. In other words, shared prefix prompts will be + expanded by this factor. Defaults to 1. + num_copies (int, optional): Number of times to replicate the core radix tree. + Defaults to 1. + prompt_len_multiplier (float, optional): Multiplies the leaf path lengths by this factor + (rounded to integers). Use values < 1 to generate shorter prompts. Defaults to 1. 
+ Note this does not affect the lengths of the core context prompts. + + NOTE: currently may only work for the mooncake trace file, + as it assumes consecutive integers + + NOTE: If the context_len_multiplier is not one, then the synthetic data + cannot be mixed and matched with the original trace file, + as the hash ids will be relabeled. + """ + self.block_size = block_size + self.num_copies = num_copies + self.speedup_ratio = float(speedup_ratio) + self.context_len_multiplier = float(context_len_multiplier) + self.prompt_len_multiplier = float(prompt_len_multiplier) + + # assert correct arg bounds + assert ( + isinstance(self.num_copies, int) and self.num_copies >= 1 + ), "num_copies must be an integer greater than 1" + assert ( + isinstance(self.speedup_ratio, float) and self.speedup_ratio > 0 + ), "speedup_ratio must be a positive float" + assert ( + isinstance(self.context_len_multiplier, float) + and self.context_len_multiplier > 0 + ), "context_len_multiplier must be a positive float" + assert ( + isinstance(self.prompt_len_multiplier, float) + and self.prompt_len_multiplier > 0 + ), "prompt_len_multiplier must be a positive float" + + # extract data from json file + with open(dataset_file, "r") as f: + hash_ids_list = [np.array(json.loads(line)["hash_ids"]) for line in f] + with open(dataset_file, "r") as f: + timestamps = [int(json.loads(line)["timestamp"]) for line in f] + with open(dataset_file, "r") as f: + input_lens = [np.array(json.loads(line)["input_length"]) for line in f] + with open(dataset_file, "r") as f: + output_lens = [int(json.loads(line)["output_length"]) for line in f] + + # represent prefix-tree as directed graph + self.G = nx.DiGraph() + max_hash_id = -1 + num_paths = 0 + + self.G.add_node(-1, end=0) + for hash_ids in hash_ids_list: + num_paths += 1 + for i in range(len(hash_ids)): + u = hash_ids[i - 1] if i > 0 else -1 + v = hash_ids[i] + max_hash_id = max(v, max_hash_id) + + if v in self.G: + self.G.nodes[v]["visited"] += 1 + else: + self.G.add_node(v, visited=1, end=0) + + if self.G.has_edge(u, v): + self.G[u][v]["weight"] += 1 + else: + self.G.add_edge(u, v, weight=1) + + self.G.nodes[v]["end"] += 1 + + self.G.nodes[-1]["visited"] = num_paths + self.max_hash_id = max_hash_id + + nodes_with_multiple_parents = [ + (node, d) for node, d in self.G.in_degree() if d > 1 + ] + if nodes_with_multiple_parents: + print("WARNING: The following nodes have multiple parents (in-degree > 1):") + for node, in_degree in nodes_with_multiple_parents: + parents = list(self.G.predecessors(node)) + print(f" Node {node}: in-degree={in_degree}, parents={parents}") + + assert all(d <= 1 for _, d in self.G.in_degree()), "Graph is not a tree" + + # visits to leaf nodes (non-core branches) are considered as ended + for node in self.G.nodes(): + if "to_leaf" not in self.G.nodes[node]: + self.G.nodes[node]["to_leaf"] = 0 + if self.G.nodes[node]["visited"] <= 1: + continue + for child in self.G.successors(node): + if self.G.nodes[child]["visited"] == 1: + self.G.nodes[node]["to_leaf"] += 1 + + # make graph radix-like + self.G = _merge_chains(self.G) + self.G, leaves_lens = _remove_leaves(self.G) + + # Apply prompt_len_multiplier to leaves_lens + if self.prompt_len_multiplier != 1: + leaves_lens = [ + max(1, round(length * self.prompt_len_multiplier)) + for length in leaves_lens + ] + + self.leaves_lens_sampler = EmpiricalSampler(leaves_lens) + self._relabel_nodes() + self.G = _precompute_transition_cdfs(self.G) + + # get statistics of timing, request counts, ISL, and OSL + 
request_counts = list(Counter(timestamps).values()) + self.request_counts_sampler = EmpiricalSampler(request_counts) + timedeltas = np.diff(timestamps) + timedeltas = timedeltas[timedeltas > 0] + self.timedeltas_sampler = EmpiricalSampler(timedeltas) + input_lens_mod = np.array( + [ + input_len - (len(hash_ids) - 1) * block_size + for input_len, hash_ids in zip(input_lens, hash_ids_list) + ] + ) + assert np.all(0 < input_lens_mod) and np.all(input_lens_mod <= self.block_size) + self.input_lens_mod_sampler = EmpiricalSampler(input_lens_mod) + self.output_lens_sampler = EmpiricalSampler(output_lens) + + print(self) + + def _relabel_nodes(self): + # Scale node labels by length multiplier if needed + if self.context_len_multiplier > 1: + multiplier = int(np.ceil(self.context_len_multiplier)) + + # Create mapping for relabeling, preserving -1 and -2 + mapping = { + node: (node if node < 0 else node * multiplier + multiplier) + for node in self.G.nodes() + } + self.G = nx.relabel_nodes(self.G, mapping) + # Update max_hash_id + self.max_hash_id = multiplier * self.max_hash_id + multiplier + + # Shrink the lengths, but no need to relabel nodes + elif self.context_len_multiplier < 1: + for node in self.G.nodes(): + self.G.nodes[node]["length"] = max( + round(self.G.nodes[node]["length"] * self.context_len_multiplier), 1 + ) + + def _synthesize_leaf_path(self): + # Sample the leaf path length + leaf_length = self.leaves_lens_sampler.sample() + + # Generate new nodes starting from max_hash_id + 1 + path = [int(self.max_hash_id + 1 + i) for i in range(leaf_length)] + + # Update max_hash_id + self.max_hash_id += leaf_length + + return path + + def synthesize_path(self): + # Start from root node (-1) + current_node = -1 + path = [] + context_len = 0 + + # Continue until we reach a node with no outgoing edges + while True: + # Use precomputed CDFs for efficient sampling + next_node = sample_from_cdf( + self.G.nodes[current_node]["out_nodes"], + self.G.nodes[current_node]["out_cdf"], + ) + + # end early + # break and start sampling unique user prompt + if next_node == -2: + break + # break and don't sample leaf + if next_node == -3: + return path, False, 0 + + # otherwise continue down prefix tree + + # Get the length of the contracted path + length = self.G.nodes[next_node]["length"] + context_len += length * self.block_size + + # Add all intermediate nodes + for i in range(length): + path.append(int(next_node - (length - 1) + i)) + + current_node = next_node + + unique_user_prompt = self._synthesize_leaf_path() + # Append a leaf path at the end + return path + unique_user_prompt, True, context_len + + def synthesize_requests(self, num_requests, input_len_filter=None): + timestamp = 0 + + requests = [] + request_id = 0 + + while request_id < num_requests: + requests_per_interval = self.request_counts_sampler.sample() + + for _ in range(requests_per_interval): + path, leaf_flag, context_len = self.synthesize_path() + if leaf_flag: + input_len = ( + len(path) - 1 + ) * self.block_size + self.input_lens_mod_sampler.sample() + else: + input_len = len(path) * self.block_size + output_len = self.output_lens_sampler.sample() + + if input_len_filter is not None and input_len > input_len_filter: + continue + requests.append( + { + "timestamp": int(timestamp), + "input_length": int(input_len), + "output_length": int(output_len), + "hash_ids": path, + "context_len": int(context_len), + "unique_user_prompt_len": int(input_len - context_len), + } + ) + request_id += 1 + if request_id >= num_requests: + break + + 
timestamp += round(self.timedeltas_sampler.sample() / self.speedup_ratio) + + # Adjust hash_ids if num_copies > 1 + if self.num_copies > 1: + for request in requests: + offset = (np.random.randint(0, self.num_copies)) * ( + self.max_hash_id + 1 + ) + request["hash_ids"] = [ + int(hash_id + offset) for hash_id in request["hash_ids"] + ] + + return requests + + def __repr__(self): + path_lengths = nx.single_source_shortest_path_length(self.G, -1) + core_radix_tree_depth = max(path_lengths.values()) if path_lengths else 0 + + rep = "MooncakeSynth(" + rep += f"core_radix_tree_size={len(self.G)}, " + rep += f"core_radix_tree_depth={core_radix_tree_depth}, " + rep += f"block_size={self.block_size})" + + children = list(self.G.successors(-1)) + + data = { + "Child Node": children, + "Visited Count": [self.G.nodes[child]["visited"] for child in children], + "Length": [self.G.nodes[child].get("length", "N/A") for child in children], + } + df = pd.DataFrame(data) + df = df[df["Visited Count"] >= 5] + df = df.sort_values("Visited Count", ascending=False) + grouped = df.groupby("Length", sort=True) + + rep += "\nRoot children (grouped by length, visited count ≥ 5):\n" + # Print each group in a minimal format - just length and visit counts + for length, group in grouped: + # Get top 5 nodes by visited count + top_nodes = group.head(5) + + # Extract just the visit counts + visit_counts = top_nodes["Visited Count"].tolist() + + # Format as a single line with just length and visit counts + rep += f"\nLength: {length}, Visited Counts: {visit_counts}" + + return rep + + +if __name__ == "__main__": + import argparse + from pathlib import Path + + parser = argparse.ArgumentParser(description="Synthesize Mooncake-Esque dataset") + parser.add_argument( + "--input-file", + default="../datasets/mooncake_trace.jsonl", + type=str, + help="Path to the input CSV file", + ) + parser.add_argument( + "--num-requests", + type=int, + default=int(1e5), + help="Number of requests to synthesize (default: 100000)", + ) + parser.add_argument( + "--speedup-ratio", + type=float, + default=1, + help="Factor to speed up request intervals (default: 1)", + ) + parser.add_argument( + "--depth-multiplier", + type=float, + default=1.0, + help="Multiplier for prefix lengths (default: 1.0)", + ) + parser.add_argument( + "--width-multiplier", + type=int, + default=1, + help="Number of times to replicate the core radix tree (default: 1)", + ) + parser.add_argument( + "--prompt-len-multiplier", + type=float, + default=1.0, + help="Multiplier for leaf path lengths (default: 1.0, use <1 for shorter prompts)", + ) + parser.add_argument( + "--max-isl", + type=int, + default=None, + help="Maximum input sequence length to include in output (default: None, no filtering)", + ) + parser.add_argument( + "--block-size", + type=int, + default=512, + help="Block size for prefilling and decoding (default: 512)", + ) + parser.add_argument( + "--output-file", + type=str, + default=None, + help="Path to the output file (default: None, no output)", + ) + args = parser.parse_args() + + dataset_file = Path(args.input_file).resolve() + if args.output_file is None: + output_file = dataset_file.with_stem( + f"{dataset_file.stem}_synthesized_{int(args.depth_multiplier)}x{args.width_multiplier}+{args.prompt_len_multiplier}+{args.speedup_ratio}+{args.max_isl}" + ) + else: + output_file = Path(args.output_file).resolve() + + print("learning from dataset...", flush=True) + synthesizer = Synthesizer( + dataset_file, + block_size=args.block_size, + 
speedup_ratio=args.speedup_ratio, + context_len_multiplier=args.depth_multiplier, + num_copies=args.width_multiplier, + prompt_len_multiplier=args.prompt_len_multiplier, + ) + + print("synthesizing requests...", flush=True) + requests = synthesizer.synthesize_requests(args.num_requests, args.max_isl) + print(f"synthesized {len(requests)} requests") + + # Print statistics in a single table with metrics as rows and statistics as columns + print("\n###### Synthesized Statistics ######") + + # Extract all values first + metrics = { + "ISL": [req["input_length"] for req in requests], + "Context Length": [req["context_len"] for req in requests], + "Unique Prompt Length": [req["unique_user_prompt_len"] for req in requests], + "OSL": [req["output_length"] for req in requests], + } + + # Initialize lists to store the data + metric_names = [] + stats_data = [] + + # Calculate statistics for each metric + calculate_and_print_statistics(metrics) + + with open(output_file, "w") as f: + for request in requests: + f.write(json.dumps(request) + "\n") + print(f"synthetic dataset saved at {Path(output_file).resolve()}") diff --git a/benchmark/tests/__init__.py b/benchmark/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmark/tests/test_sampler.py b/benchmark/tests/test_sampler.py new file mode 100644 index 0000000000..5d038f46f5 --- /dev/null +++ b/benchmark/tests/test_sampler.py @@ -0,0 +1,31 @@ +import numpy as np +from collections import Counter + +from benchmark.data_synth.sampler import EmpiricalSampler + + +def test_empirical_sampler_distribution(): + # Create a test array with equal numbers of 1, 2, and 3 + test_data = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]) + + # Create the sampler + sampler = EmpiricalSampler(test_data) + + # Sample 1000 times + samples = [sampler.sample() for _ in range(1000)] + + # Count occurrences of each value + counts = Counter(samples) + + # Verify each number (1, 2, 3) appears between 300 and 400 times + for value in [1, 2, 3]: + assert ( + 300 <= counts[value] <= 400 + ), f"Value {value} appeared {counts[value]} times, expected 300-400 times" + + # Verify no other values appear in the samples + assert set(counts.keys()) == { + 1, + 2, + 3, + }, f"Unexpected values in samples: {set(counts.keys()) - {1, 2, 3}}" diff --git a/benchmark/tests/test_syntheiszer.py b/benchmark/tests/test_syntheiszer.py new file mode 100644 index 0000000000..fd905aa081 --- /dev/null +++ b/benchmark/tests/test_syntheiszer.py @@ -0,0 +1,66 @@ +import json +import os +import random +import tempfile +import unittest + +from benchmark.data_synth.synthesizer import Synthesizer + + +# Helper function to create and dump data +def dump_record(handle, timestamp, hash_ids, block_size=512): + input_length = block_size * len(hash_ids) + output_length = random.randint(50, 250) + + data = { + "timestamp": timestamp, + "hash_ids": hash_ids, + "input_length": input_length, + "output_length": output_length, + } + json.dump(data, handle) + handle.write("\n") + + +class TestSynthesizer(unittest.TestCase): + def test_graph_structure(self): + # Create a temporary JSONL file with the specified data + with tempfile.NamedTemporaryFile( + mode="w", suffix=".jsonl", delete=False + ) as tmp: + dump_record(tmp, 1000, [0, 1, 2, 3, 4]) + dump_record(tmp, 2000, [0, 1, 2]) + + # Create the Synthesizer with the temporary file + synthesizer = Synthesizer(tmp.name, block_size=512) + + # Verify the graph structure + # Check that the root node (-1) has only one child + root_successors = 
list(synthesizer.G.successors(-1)) + self.assertEqual( + len(root_successors), 1, "Root node should have exactly one child" + ) + + # Verify that the child is node 2 with length 3 + self.assertEqual( + root_successors[0], + 2, + f"Root's child should be node 2, but is {root_successors[0]}", + ) + self.assertEqual( + synthesizer.G.nodes[2]["length"], 3, "Node 2 should have length 3" + ) + + # Verify the edge weight from root to child is 2 + self.assertEqual( + synthesizer.G[-1][2]["weight"], + 2, + "Edge weight from root to node 2 should be 2", + ) + + # Clean up + os.unlink(tmp.name) + + +if __name__ == "__main__": + unittest.main() diff --git a/benchmark/utils/__init__.py b/benchmark/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmark/utils/logging.py b/benchmark/utils/logging.py new file mode 100644 index 0000000000..7317858ee0 --- /dev/null +++ b/benchmark/utils/logging.py @@ -0,0 +1,38 @@ +from typing import Dict, List, Any +import numpy as np +import pandas as pd +from tabulate import tabulate + + +def calculate_and_print_statistics(metrics: Dict[str, List[float]]) -> pd.DataFrame: + """ + Calculate statistics for a dictionary of metrics and print them in a tabular format. + + Args: + metrics: Dictionary where keys are metric names and values are lists of metric values + + Returns: + pandas.DataFrame: DataFrame containing the calculated statistics + """ + metric_names = [] + stats_data = [] + + # Calculate statistics for each metric + for metric_name, values in metrics.items(): + metric_names.append(metric_name) + stats_data.append( + { + "Mean": np.mean(values), + "Std Dev": np.std(values), + "Min": np.min(values), + "P25": np.percentile(values, 25), + "Median": np.median(values), + "P75": np.percentile(values, 75), + "Max": np.max(values), + } + ) + + stats_df = pd.DataFrame(stats_data, index=metric_names) + print(tabulate(stats_df.round(2), headers="keys", tablefmt="pretty")) + + return stats_df diff --git a/container/deps/requirements.txt b/container/deps/requirements.txt index 2546f65f85..c61b8440d0 100644 --- a/container/deps/requirements.txt +++ b/container/deps/requirements.txt @@ -21,6 +21,7 @@ httpx kubernetes==32.0.1 msgspec mypy +networkx numpy opentelemetry-api opentelemetry-sdk From 9e69e5015824755f21b86525e66e18081a6f17b9 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 14:28:06 -0700 Subject: [PATCH 02/50] benchmarks with an s --- {benchmark => benchmarks}/__init__.py | 0 {benchmark => benchmarks}/data_synth/README.md | 0 {benchmark => benchmarks}/data_synth/__init__.py | 0 {benchmark => benchmarks}/data_synth/graph_utils.py | 2 +- {benchmark => benchmarks}/data_synth/prefix_analyzer.py | 0 {benchmark => benchmarks}/data_synth/sampler.py | 0 {benchmark => benchmarks}/data_synth/synthesizer.py | 6 +++--- {benchmark => benchmarks}/tests/__init__.py | 0 {benchmark => benchmarks}/tests/test_sampler.py | 2 +- {benchmark => benchmarks}/tests/test_syntheiszer.py | 4 +++- {benchmark => benchmarks}/utils/__init__.py | 0 {benchmark => benchmarks}/utils/logging.py | 0 12 files changed, 8 insertions(+), 6 deletions(-) rename {benchmark => benchmarks}/__init__.py (100%) rename {benchmark => benchmarks}/data_synth/README.md (100%) rename {benchmark => benchmarks}/data_synth/__init__.py (100%) rename {benchmark => benchmarks}/data_synth/graph_utils.py (98%) rename {benchmark => benchmarks}/data_synth/prefix_analyzer.py (100%) rename {benchmark => benchmarks}/data_synth/sampler.py (100%) rename {benchmark => 
benchmarks}/data_synth/synthesizer.py (98%) rename {benchmark => benchmarks}/tests/__init__.py (100%) rename {benchmark => benchmarks}/tests/test_sampler.py (93%) rename {benchmark => benchmarks}/tests/test_syntheiszer.py (95%) rename {benchmark => benchmarks}/utils/__init__.py (100%) rename {benchmark => benchmarks}/utils/logging.py (100%) diff --git a/benchmark/__init__.py b/benchmarks/__init__.py similarity index 100% rename from benchmark/__init__.py rename to benchmarks/__init__.py diff --git a/benchmark/data_synth/README.md b/benchmarks/data_synth/README.md similarity index 100% rename from benchmark/data_synth/README.md rename to benchmarks/data_synth/README.md diff --git a/benchmark/data_synth/__init__.py b/benchmarks/data_synth/__init__.py similarity index 100% rename from benchmark/data_synth/__init__.py rename to benchmarks/data_synth/__init__.py diff --git a/benchmark/data_synth/graph_utils.py b/benchmarks/data_synth/graph_utils.py similarity index 98% rename from benchmark/data_synth/graph_utils.py rename to benchmarks/data_synth/graph_utils.py index 4437a12e92..b321cb8325 100644 --- a/benchmark/data_synth/graph_utils.py +++ b/benchmarks/data_synth/graph_utils.py @@ -1,7 +1,7 @@ import numpy as np import networkx as nx -from benchmark.data_synth.sampler import get_cdf +from benchmarks.data_synth.sampler import get_cdf def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: diff --git a/benchmark/data_synth/prefix_analyzer.py b/benchmarks/data_synth/prefix_analyzer.py similarity index 100% rename from benchmark/data_synth/prefix_analyzer.py rename to benchmarks/data_synth/prefix_analyzer.py diff --git a/benchmark/data_synth/sampler.py b/benchmarks/data_synth/sampler.py similarity index 100% rename from benchmark/data_synth/sampler.py rename to benchmarks/data_synth/sampler.py diff --git a/benchmark/data_synth/synthesizer.py b/benchmarks/data_synth/synthesizer.py similarity index 98% rename from benchmark/data_synth/synthesizer.py rename to benchmarks/data_synth/synthesizer.py index 92f2202655..a797fcc904 100644 --- a/benchmark/data_synth/synthesizer.py +++ b/benchmarks/data_synth/synthesizer.py @@ -5,13 +5,13 @@ import numpy as np import pandas as pd -from benchmark.utils.logging import calculate_and_print_statistics -from benchmark.data_synth.graph_utils import ( +from benchmarks.utils.logging import calculate_and_print_statistics +from benchmarks.data_synth.graph_utils import ( _merge_chains, _precompute_transition_cdfs, _remove_leaves, ) -from benchmark.data_synth.sampler import EmpiricalSampler, sample_from_cdf +from benchmarks.data_synth.sampler import EmpiricalSampler, sample_from_cdf class Synthesizer: diff --git a/benchmark/tests/__init__.py b/benchmarks/tests/__init__.py similarity index 100% rename from benchmark/tests/__init__.py rename to benchmarks/tests/__init__.py diff --git a/benchmark/tests/test_sampler.py b/benchmarks/tests/test_sampler.py similarity index 93% rename from benchmark/tests/test_sampler.py rename to benchmarks/tests/test_sampler.py index 5d038f46f5..3e2a2b2f39 100644 --- a/benchmark/tests/test_sampler.py +++ b/benchmarks/tests/test_sampler.py @@ -1,7 +1,7 @@ import numpy as np from collections import Counter -from benchmark.data_synth.sampler import EmpiricalSampler +from benchmarks.data_synth.sampler import EmpiricalSampler def test_empirical_sampler_distribution(): diff --git a/benchmark/tests/test_syntheiszer.py b/benchmarks/tests/test_syntheiszer.py similarity index 95% rename from benchmark/tests/test_syntheiszer.py rename to 
benchmarks/tests/test_syntheiszer.py index fd905aa081..46cb29395e 100644 --- a/benchmark/tests/test_syntheiszer.py +++ b/benchmarks/tests/test_syntheiszer.py @@ -4,7 +4,7 @@ import tempfile import unittest -from benchmark.data_synth.synthesizer import Synthesizer +from benchmarks.data_synth.synthesizer import Synthesizer # Helper function to create and dump data @@ -33,6 +33,8 @@ def test_graph_structure(self): # Create the Synthesizer with the temporary file synthesizer = Synthesizer(tmp.name, block_size=512) + + print(synthesizer) # Verify the graph structure # Check that the root node (-1) has only one child diff --git a/benchmark/utils/__init__.py b/benchmarks/utils/__init__.py similarity index 100% rename from benchmark/utils/__init__.py rename to benchmarks/utils/__init__.py diff --git a/benchmark/utils/logging.py b/benchmarks/utils/logging.py similarity index 100% rename from benchmark/utils/logging.py rename to benchmarks/utils/logging.py From 08376194e2ebe072aceda7792d69789e49591cc8 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 14:42:31 -0700 Subject: [PATCH 03/50] decrement 1 for core radix tree size --- benchmarks/data_synth/synthesizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/data_synth/synthesizer.py b/benchmarks/data_synth/synthesizer.py index a797fcc904..8245a6be06 100644 --- a/benchmarks/data_synth/synthesizer.py +++ b/benchmarks/data_synth/synthesizer.py @@ -292,10 +292,11 @@ def synthesize_requests(self, num_requests, input_len_filter=None): def __repr__(self): path_lengths = nx.single_source_shortest_path_length(self.G, -1) + core_radix_tree_size = len(self.G) - 1 core_radix_tree_depth = max(path_lengths.values()) if path_lengths else 0 rep = "MooncakeSynth(" - rep += f"core_radix_tree_size={len(self.G)}, " + rep += f"core_radix_tree_size={core_radix_tree_size}, " rep += f"core_radix_tree_depth={core_radix_tree_depth}, " rep += f"block_size={self.block_size})" From 69b3971ef03f9445da0863991048fcea066cee45 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 14:43:00 -0700 Subject: [PATCH 04/50] rename to data_utils --- benchmarks/{data_synth => data_utils}/README.md | 0 benchmarks/{data_synth => data_utils}/__init__.py | 0 benchmarks/{data_synth => data_utils}/graph_utils.py | 2 +- benchmarks/{data_synth => data_utils}/prefix_analyzer.py | 0 benchmarks/{data_synth => data_utils}/sampler.py | 0 benchmarks/{data_synth => data_utils}/synthesizer.py | 4 ++-- 6 files changed, 3 insertions(+), 3 deletions(-) rename benchmarks/{data_synth => data_utils}/README.md (100%) rename benchmarks/{data_synth => data_utils}/__init__.py (100%) rename benchmarks/{data_synth => data_utils}/graph_utils.py (98%) rename benchmarks/{data_synth => data_utils}/prefix_analyzer.py (100%) rename benchmarks/{data_synth => data_utils}/sampler.py (100%) rename benchmarks/{data_synth => data_utils}/synthesizer.py (99%) diff --git a/benchmarks/data_synth/README.md b/benchmarks/data_utils/README.md similarity index 100% rename from benchmarks/data_synth/README.md rename to benchmarks/data_utils/README.md diff --git a/benchmarks/data_synth/__init__.py b/benchmarks/data_utils/__init__.py similarity index 100% rename from benchmarks/data_synth/__init__.py rename to benchmarks/data_utils/__init__.py diff --git a/benchmarks/data_synth/graph_utils.py b/benchmarks/data_utils/graph_utils.py similarity index 98% rename from benchmarks/data_synth/graph_utils.py rename to benchmarks/data_utils/graph_utils.py index b321cb8325..e23c088eea 100644 --- 
a/benchmarks/data_synth/graph_utils.py +++ b/benchmarks/data_utils/graph_utils.py @@ -1,7 +1,7 @@ import numpy as np import networkx as nx -from benchmarks.data_synth.sampler import get_cdf +from benchmarks.data_utils.sampler import get_cdf def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: diff --git a/benchmarks/data_synth/prefix_analyzer.py b/benchmarks/data_utils/prefix_analyzer.py similarity index 100% rename from benchmarks/data_synth/prefix_analyzer.py rename to benchmarks/data_utils/prefix_analyzer.py diff --git a/benchmarks/data_synth/sampler.py b/benchmarks/data_utils/sampler.py similarity index 100% rename from benchmarks/data_synth/sampler.py rename to benchmarks/data_utils/sampler.py diff --git a/benchmarks/data_synth/synthesizer.py b/benchmarks/data_utils/synthesizer.py similarity index 99% rename from benchmarks/data_synth/synthesizer.py rename to benchmarks/data_utils/synthesizer.py index 8245a6be06..c392f74f4e 100644 --- a/benchmarks/data_synth/synthesizer.py +++ b/benchmarks/data_utils/synthesizer.py @@ -6,12 +6,12 @@ import pandas as pd from benchmarks.utils.logging import calculate_and_print_statistics -from benchmarks.data_synth.graph_utils import ( +from benchmarks.data_utils.graph_utils import ( _merge_chains, _precompute_transition_cdfs, _remove_leaves, ) -from benchmarks.data_synth.sampler import EmpiricalSampler, sample_from_cdf +from benchmarks.data_utils.sampler import EmpiricalSampler, sample_from_cdf class Synthesizer: From b4341d971a469282e4d5863ea0032913b8571c5a Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 14:44:23 -0700 Subject: [PATCH 05/50] change paths in tests --- benchmarks/tests/test_sampler.py | 2 +- benchmarks/tests/test_syntheiszer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/tests/test_sampler.py b/benchmarks/tests/test_sampler.py index 3e2a2b2f39..45879136a5 100644 --- a/benchmarks/tests/test_sampler.py +++ b/benchmarks/tests/test_sampler.py @@ -1,7 +1,7 @@ import numpy as np from collections import Counter -from benchmarks.data_synth.sampler import EmpiricalSampler +from benchmarks.data_utils.sampler import EmpiricalSampler def test_empirical_sampler_distribution(): diff --git a/benchmarks/tests/test_syntheiszer.py b/benchmarks/tests/test_syntheiszer.py index 46cb29395e..260afbc13b 100644 --- a/benchmarks/tests/test_syntheiszer.py +++ b/benchmarks/tests/test_syntheiszer.py @@ -4,7 +4,7 @@ import tempfile import unittest -from benchmarks.data_synth.synthesizer import Synthesizer +from benchmarks.data_utils.synthesizer import Synthesizer # Helper function to create and dump data From c0a4b3c160fd748495253101c1f69f0a1120e41a Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 15:19:51 -0700 Subject: [PATCH 06/50] fix edge case of contraction (with -1 head), and added type hints --- benchmarks/data_utils/README.md | 5 +-- benchmarks/data_utils/graph_utils.py | 2 +- benchmarks/data_utils/synthesizer.py | 30 ++++++------- benchmarks/tests/test_syntheiszer.py | 67 +++++++++++----------------- container/deps/requirements.txt | 2 + 5 files changed, 46 insertions(+), 60 deletions(-) diff --git a/benchmarks/data_utils/README.md b/benchmarks/data_utils/README.md index 375331a98e..d21f4e3a63 100644 --- a/benchmarks/data_utils/README.md +++ b/benchmarks/data_utils/README.md @@ -1,4 +1,4 @@ -This directory is currently used for generate synthetic data based on the mooncake dataset, but should be easily extendible to any request datasets with (prefix) hash ids. 
+This directory is currently used for generate synthetic data based on the mooncake dataset, but should be easily extendible to any request datasets with (prefix) hash ids, with a current cavaet. The synthesizer is designed to work for jsonl files in the "mooncake" trace file format, meaning that the input are increasing integers of block hashes. For now, new block hashes must be the next consecutive integer, otherwise will not work. ## Quickstart @@ -13,9 +13,6 @@ python -m benchmark.data_synth.synthesizer \ ``` where `num-requests` sets the number of total synthetic requests generated, `speedup-ratio` tunes the rate at which the requests are sent, `depth-multiplier` tunes the lengths of the request prefixes (higher multiplier will then yield longer ISLs), and `width-multiplier` controls the branching factor of the synthetic requests (higher multiplier will generate more diverse request patterns). - -The synthesizer is designed to work for jsonl files in the "mooncake" trace file format, meaning that the input are increasing integers of block hashes. For now, new block hashes must be the next consecutive integer, otherwise will not work. - ## How it works The generation algorithm, simplified, is as follows diff --git a/benchmarks/data_utils/graph_utils.py b/benchmarks/data_utils/graph_utils.py index e23c088eea..2fcc822f63 100644 --- a/benchmarks/data_utils/graph_utils.py +++ b/benchmarks/data_utils/graph_utils.py @@ -33,7 +33,7 @@ def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: for node in chain_nodes: node_pred = list(G.predecessors(node))[0] # find the parent node source - if G.nodes[node_pred]["visited"] == visited: + if G.nodes[node_pred]["visited"] == visited and node_pred != -1: continue weight = G[node_pred][node]["weight"] diff --git a/benchmarks/data_utils/synthesizer.py b/benchmarks/data_utils/synthesizer.py index c392f74f4e..4fdf07b837 100644 --- a/benchmarks/data_utils/synthesizer.py +++ b/benchmarks/data_utils/synthesizer.py @@ -4,6 +4,7 @@ import networkx as nx import numpy as np import pandas as pd +from typing import Optional from benchmarks.utils.logging import calculate_and_print_statistics from benchmarks.data_utils.graph_utils import ( @@ -116,16 +117,15 @@ def __init__( self.G.nodes[-1]["visited"] = num_paths self.max_hash_id = max_hash_id - nodes_with_multiple_parents = [ - (node, d) for node, d in self.G.in_degree() if d > 1 - ] - if nodes_with_multiple_parents: - print("WARNING: The following nodes have multiple parents (in-degree > 1):") - for node, in_degree in nodes_with_multiple_parents: + invalid_nodes = [(node, d) for node, d in self.G.in_degree() if d > 1] + if invalid_nodes: + print("ERROR: The following nodes have multiple parents (in-degree > 1):") + for node, in_degree in invalid_nodes: parents = list(self.G.predecessors(node)) print(f" Node {node}: in-degree={in_degree}, parents={parents}") - - assert all(d <= 1 for _, d in self.G.in_degree()), "Graph is not a tree" + raise ValueError( + "Graph is not a valid tree: nodes with multiple parents detected" + ) # visits to leaf nodes (non-core branches) are considered as ended for node in self.G.nodes(): @@ -168,9 +168,7 @@ def __init__( self.input_lens_mod_sampler = EmpiricalSampler(input_lens_mod) self.output_lens_sampler = EmpiricalSampler(output_lens) - print(self) - - def _relabel_nodes(self): + def _relabel_nodes(self) -> None: # Scale node labels by length multiplier if needed if self.context_len_multiplier > 1: multiplier = int(np.ceil(self.context_len_multiplier)) @@ -191,7 +189,7 @@ def 
_relabel_nodes(self): round(self.G.nodes[node]["length"] * self.context_len_multiplier), 1 ) - def _synthesize_leaf_path(self): + def _synthesize_leaf_path(self) -> list[int]: # Sample the leaf path length leaf_length = self.leaves_lens_sampler.sample() @@ -203,7 +201,7 @@ def _synthesize_leaf_path(self): return path - def synthesize_path(self): + def synthesize_path(self) -> tuple[list[int], bool, int]: # Start from root node (-1) current_node = -1 path = [] @@ -241,7 +239,9 @@ def synthesize_path(self): # Append a leaf path at the end return path + unique_user_prompt, True, context_len - def synthesize_requests(self, num_requests, input_len_filter=None): + def synthesize_requests( + self, num_requests: int, input_len_filter: Optional[int] = None + ) -> list[dict[str, any]]: timestamp = 0 requests = [] @@ -290,7 +290,7 @@ def synthesize_requests(self, num_requests, input_len_filter=None): return requests - def __repr__(self): + def __repr__(self) -> str: path_lengths = nx.single_source_shortest_path_length(self.G, -1) core_radix_tree_size = len(self.G) - 1 core_radix_tree_depth = max(path_lengths.values()) if path_lengths else 0 diff --git a/benchmarks/tests/test_syntheiszer.py b/benchmarks/tests/test_syntheiszer.py index 260afbc13b..8513c73f8a 100644 --- a/benchmarks/tests/test_syntheiszer.py +++ b/benchmarks/tests/test_syntheiszer.py @@ -22,46 +22,33 @@ def dump_record(handle, timestamp, hash_ids, block_size=512): handle.write("\n") -class TestSynthesizer(unittest.TestCase): - def test_graph_structure(self): - # Create a temporary JSONL file with the specified data - with tempfile.NamedTemporaryFile( - mode="w", suffix=".jsonl", delete=False - ) as tmp: - dump_record(tmp, 1000, [0, 1, 2, 3, 4]) - dump_record(tmp, 2000, [0, 1, 2]) - - # Create the Synthesizer with the temporary file - synthesizer = Synthesizer(tmp.name, block_size=512) - - print(synthesizer) - - # Verify the graph structure - # Check that the root node (-1) has only one child - root_successors = list(synthesizer.G.successors(-1)) - self.assertEqual( - len(root_successors), 1, "Root node should have exactly one child" - ) - - # Verify that the child is node 2 with length 3 - self.assertEqual( - root_successors[0], - 2, - f"Root's child should be node 2, but is {root_successors[0]}", - ) - self.assertEqual( - synthesizer.G.nodes[2]["length"], 3, "Node 2 should have length 3" - ) - - # Verify the edge weight from root to child is 2 - self.assertEqual( - synthesizer.G[-1][2]["weight"], - 2, - "Edge weight from root to node 2 should be 2", - ) - - # Clean up - os.unlink(tmp.name) +def test_graph_structure(): + # Create a temporary JSONL file with the specified data + with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as tmp: + dump_record(tmp, 1000, [0, 1, 2, 3, 4]) + dump_record(tmp, 2000, [0, 1, 2]) + + # Create the Synthesizer with the temporary file + synthesizer = Synthesizer(tmp.name, block_size=512) + + # Verify the graph structure + # Check that the root node (-1) has only one child + root_successors = list(synthesizer.G.successors(-1)) + assert len(root_successors) == 1, "Root node should have exactly one child" + + # Verify that the child is node 2 with length 3 + assert ( + root_successors[0] == 2 + ), f"Root's child should be node 2, but is {root_successors[0]}" + assert synthesizer.G.nodes[2]["length"] == 3, "Node 2 should have length 3" + + # Verify the edge weight from root to child is 2 + assert ( + synthesizer.G[-1][2]["weight"] == 2 + ), "Edge weight from root to node 2 should be 2" + 
+ # Clean up + os.unlink(tmp.name) if __name__ == "__main__": diff --git a/container/deps/requirements.txt b/container/deps/requirements.txt index c61b8440d0..e4efcae387 100644 --- a/container/deps/requirements.txt +++ b/container/deps/requirements.txt @@ -25,6 +25,7 @@ networkx numpy opentelemetry-api opentelemetry-sdk +pandas pip==25.0.1 pre-commit protobuf==5.27.3 @@ -32,6 +33,7 @@ pydantic==2.7.1 pyright PyYAML sentencepiece +tabulate tensorboard==2.19.0 tensorboardX==2.6.2.2 transformers From 25ebbb69a193d2d6a40fa0622271a735eecb0150 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 16:25:30 -0700 Subject: [PATCH 07/50] more stringent graph structure testing --- benchmarks/data_utils/sampler.py | 13 +++++- benchmarks/tests/test_syntheiszer.py | 68 ++++++++++++++++++++-------- 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/benchmarks/data_utils/sampler.py b/benchmarks/data_utils/sampler.py index f5f9457b44..9356ee5dca 100644 --- a/benchmarks/data_utils/sampler.py +++ b/benchmarks/data_utils/sampler.py @@ -1,9 +1,12 @@ +import logging from collections import Counter -from typing import List, Tuple, Dict, Any, Union, Optional +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np from numpy.random import Generator +logger = logging.getLogger(__name__) + def get_cdf(weights: List[float]) -> np.ndarray: cumsum = np.cumsum(weights) @@ -32,7 +35,13 @@ def sample_from_cdf( class EmpiricalSampler: def __init__(self, data: Union[List[Any], np.ndarray]) -> None: self.rng = np.random.default_rng(0) - self.data, self.cdf = data_to_cdf(np.array(data)) + self.empty_data = len(data) == 0 + if self.empty_data: + logger.warning("Empty data provided to EmpiricalSampler") + else: + self.data, self.cdf = data_to_cdf(np.array(data)) def sample(self) -> Any: + if self.empty_data: + return 0 return sample_from_cdf(self.data, self.cdf, self.rng) diff --git a/benchmarks/tests/test_syntheiszer.py b/benchmarks/tests/test_syntheiszer.py index 8513c73f8a..c5cb9b1e6a 100644 --- a/benchmarks/tests/test_syntheiszer.py +++ b/benchmarks/tests/test_syntheiszer.py @@ -8,12 +8,12 @@ # Helper function to create and dump data -def dump_record(handle, timestamp, hash_ids, block_size=512): +def dump_record(handle, hash_ids, block_size=512): input_length = block_size * len(hash_ids) output_length = random.randint(50, 250) data = { - "timestamp": timestamp, + "timestamp": 1000, "hash_ids": hash_ids, "input_length": input_length, "output_length": output_length, @@ -22,30 +22,60 @@ def dump_record(handle, timestamp, hash_ids, block_size=512): handle.write("\n") +def check_attributes( + graph, + node, + expected_children, + expected_visited=None, + expected_length=None, + expected_to_leaf=None, +): + # Check children + actual_children = list(graph.successors(node)) + assert sorted(actual_children) == sorted( + expected_children + ), f"Node {node} has children {actual_children}, expected {expected_children}" + + # Check 'visited' attribute if expected + if expected_visited is not None: + assert ( + graph.nodes[node].get("visited") == expected_visited + ), f"Node {node} has 'visited' value {graph.nodes[node].get('visited')}, expected {expected_visited}" + + # Check 'length' attribute if expected + if expected_length is not None: + assert ( + graph.nodes[node].get("length") == expected_length + ), f"Node {node} has 'length' value {graph.nodes[node].get('length')}, expected {expected_length}" + + # Check 'to_leaf' attribute if expected + if expected_to_leaf is not None: + assert ( + 
graph.nodes[node].get("to_leaf") == expected_to_leaf + ), f"Node {node} has 'to_leaf' value {graph.nodes[node].get('to_leaf')}, expected {expected_to_leaf}" + + return True + + def test_graph_structure(): # Create a temporary JSONL file with the specified data with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as tmp: - dump_record(tmp, 1000, [0, 1, 2, 3, 4]) - dump_record(tmp, 2000, [0, 1, 2]) + dump_record(tmp, [0, 1]) + dump_record(tmp, [0, 1, 2, 3, 4]) + dump_record(tmp, [0, 1, 2, 3, 4, 5, 6]) + dump_record(tmp, [7, 8]) + dump_record(tmp, [7, 8, 9, 10]) + dump_record(tmp, [11, 12]) # Create the Synthesizer with the temporary file synthesizer = Synthesizer(tmp.name, block_size=512) - + G = synthesizer.G + # Verify the graph structure - # Check that the root node (-1) has only one child - root_successors = list(synthesizer.G.successors(-1)) - assert len(root_successors) == 1, "Root node should have exactly one child" - - # Verify that the child is node 2 with length 3 - assert ( - root_successors[0] == 2 - ), f"Root's child should be node 2, but is {root_successors[0]}" - assert synthesizer.G.nodes[2]["length"] == 3, "Node 2 should have length 3" - - # Verify the edge weight from root to child is 2 - assert ( - synthesizer.G[-1][2]["weight"] == 2 - ), "Edge weight from root to node 2 should be 2" + check_attributes(G, -1, [1, 8], 6, None, 1) + check_attributes(G, 1, [4], 3, 2, 0) + check_attributes(G, 4, [], 2, 3, 1) + check_attributes(G, 8, [], 2, 2, 1) # Clean up os.unlink(tmp.name) From 80ecb22c3041ca8216ee9670ee01fd04c063f628 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 17:10:08 -0700 Subject: [PATCH 08/50] cleanups --- benchmarks/data_utils/graph_utils.py | 8 +++-- benchmarks/data_utils/protocols.py | 7 ++++ benchmarks/data_utils/synthesizer.py | 51 ++++++++++++++++------------ benchmarks/tests/test_syntheiszer.py | 2 +- 4 files changed, 43 insertions(+), 25 deletions(-) create mode 100644 benchmarks/data_utils/protocols.py diff --git a/benchmarks/data_utils/graph_utils.py b/benchmarks/data_utils/graph_utils.py index 2fcc822f63..82f50aa5b2 100644 --- a/benchmarks/data_utils/graph_utils.py +++ b/benchmarks/data_utils/graph_utils.py @@ -2,6 +2,7 @@ import networkx as nx from benchmarks.data_utils.sampler import get_cdf +from benchmarks.data_utils.protocols import SUPER_ROOT, CACHE_END, END_NODE def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: @@ -33,7 +34,7 @@ def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: for node in chain_nodes: node_pred = list(G.predecessors(node))[0] # find the parent node source - if G.nodes[node_pred]["visited"] == visited and node_pred != -1: + if G.nodes[node_pred]["visited"] == visited and node_pred != SUPER_ROOT: continue weight = G[node_pred][node]["weight"] @@ -80,7 +81,10 @@ def _precompute_transition_cdfs(G: nx.DiGraph) -> nx.DiGraph: G.nodes[node]["end"], ] G.nodes[node]["out_cdf"] = get_cdf(weights) - G.nodes[node]["out_nodes"] = [edge[1] for edge in out_edges] + [-2, -3] + G.nodes[node]["out_nodes"] = [edge[1] for edge in out_edges] + [ + CACHE_END, + END_NODE, + ] return G diff --git a/benchmarks/data_utils/protocols.py b/benchmarks/data_utils/protocols.py new file mode 100644 index 0000000000..edb4d3394c --- /dev/null +++ b/benchmarks/data_utils/protocols.py @@ -0,0 +1,7 @@ +""" +Protocol-level constants for synthetic data graph structure. 
+""" + +SUPER_ROOT = -1 # Dummy node preceding all real nodes; not an actual data root +CACHE_END = -2 # Special node indicating end of a path +END_NODE = -3 # Special node indicating to skip leaf sampling diff --git a/benchmarks/data_utils/synthesizer.py b/benchmarks/data_utils/synthesizer.py index 4fdf07b837..954947e565 100644 --- a/benchmarks/data_utils/synthesizer.py +++ b/benchmarks/data_utils/synthesizer.py @@ -13,6 +13,7 @@ _remove_leaves, ) from benchmarks.data_utils.sampler import EmpiricalSampler, sample_from_cdf +from benchmarks.data_utils.protocols import SUPER_ROOT, CACHE_END, END_NODE class Synthesizer: @@ -66,7 +67,7 @@ def __init__( # assert correct arg bounds assert ( isinstance(self.num_copies, int) and self.num_copies >= 1 - ), "num_copies must be an integer greater than 1" + ), "num_copies must be an integer greater than or equal to 1" assert ( isinstance(self.speedup_ratio, float) and self.speedup_ratio > 0 ), "speedup_ratio must be a positive float" @@ -81,24 +82,27 @@ def __init__( # extract data from json file with open(dataset_file, "r") as f: - hash_ids_list = [np.array(json.loads(line)["hash_ids"]) for line in f] - with open(dataset_file, "r") as f: - timestamps = [int(json.loads(line)["timestamp"]) for line in f] - with open(dataset_file, "r") as f: - input_lens = [np.array(json.loads(line)["input_length"]) for line in f] - with open(dataset_file, "r") as f: - output_lens = [int(json.loads(line)["output_length"]) for line in f] + hash_ids_list = [] + timestamps = [] + input_lens = [] + output_lens = [] + for line in f: + data = json.loads(line) + hash_ids_list.append(np.array(data["hash_ids"])) + timestamps.append(int(data["timestamp"])) + input_lens.append(np.array(data["input_length"])) + output_lens.append(int(data["output_length"])) # represent prefix-tree as directed graph self.G = nx.DiGraph() - max_hash_id = -1 + max_hash_id = SUPER_ROOT num_paths = 0 self.G.add_node(-1, end=0) for hash_ids in hash_ids_list: num_paths += 1 for i in range(len(hash_ids)): - u = hash_ids[i - 1] if i > 0 else -1 + u = hash_ids[i - 1] if i > 0 else SUPER_ROOT v = hash_ids[i] max_hash_id = max(v, max_hash_id) @@ -114,7 +118,7 @@ def __init__( self.G.nodes[v]["end"] += 1 - self.G.nodes[-1]["visited"] = num_paths + self.G.nodes[SUPER_ROOT]["visited"] = num_paths self.max_hash_id = max_hash_id invalid_nodes = [(node, d) for node, d in self.G.in_degree() if d > 1] @@ -202,8 +206,18 @@ def _synthesize_leaf_path(self) -> list[int]: return path def synthesize_path(self) -> tuple[list[int], bool, int]: + """ + Synthesizes a path through the core radix tree, optionally appending a unique user prompt (leaf path). + + Returns: + tuple: + - list[int]: The full path as a list of hash_ids. This consists of the cached (core) hash_ids, + with new unique hash_ids appended at the end if a leaf path is included. + - bool: Whether the path contains a leaf path (i.e., new unique hash_ids were appended). + - int: The context length, defined as the number of cached hash_ids multiplied by block_size. 
+ """ # Start from root node (-1) - current_node = -1 + current_node = SUPER_ROOT path = [] context_len = 0 @@ -217,10 +231,10 @@ def synthesize_path(self) -> tuple[list[int], bool, int]: # end early # break and start sampling unique user prompt - if next_node == -2: + if next_node == CACHE_END: break # break and don't sample leaf - if next_node == -3: + if next_node == END_NODE: return path, False, 0 # otherwise continue down prefix tree @@ -301,7 +315,6 @@ def __repr__(self) -> str: rep += f"block_size={self.block_size})" children = list(self.G.successors(-1)) - data = { "Child Node": children, "Visited Count": [self.G.nodes[child]["visited"] for child in children], @@ -312,16 +325,10 @@ def __repr__(self) -> str: df = df.sort_values("Visited Count", ascending=False) grouped = df.groupby("Length", sort=True) - rep += "\nRoot children (grouped by length, visited count ≥ 5):\n" - # Print each group in a minimal format - just length and visit counts + rep += "\nRoot nodes (grouped by length, visited count ≥ 5):\n" for length, group in grouped: - # Get top 5 nodes by visited count top_nodes = group.head(5) - - # Extract just the visit counts visit_counts = top_nodes["Visited Count"].tolist() - - # Format as a single line with just length and visit counts rep += f"\nLength: {length}, Visited Counts: {visit_counts}" return rep diff --git a/benchmarks/tests/test_syntheiszer.py b/benchmarks/tests/test_syntheiszer.py index c5cb9b1e6a..4480c6617b 100644 --- a/benchmarks/tests/test_syntheiszer.py +++ b/benchmarks/tests/test_syntheiszer.py @@ -70,7 +70,7 @@ def test_graph_structure(): # Create the Synthesizer with the temporary file synthesizer = Synthesizer(tmp.name, block_size=512) G = synthesizer.G - + # Verify the graph structure check_attributes(G, -1, [1, 8], 6, None, 1) check_attributes(G, 1, [4], 3, 2, 0) From b8179f20d6c95491466b03c06db54bf7183b8c5d Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 17:15:15 -0700 Subject: [PATCH 09/50] licenses --- benchmarks/__init__.py | 14 ++++++++++++++ benchmarks/data_utils/README.md | 17 ++++++++++++++++- benchmarks/data_utils/__init__.py | 14 ++++++++++++++ benchmarks/data_utils/graph_utils.py | 15 +++++++++++++++ benchmarks/data_utils/prefix_analyzer.py | 15 +++++++++++++++ benchmarks/data_utils/protocols.py | 15 +++++++++++++++ benchmarks/data_utils/sampler.py | 15 +++++++++++++++ benchmarks/data_utils/synthesizer.py | 15 +++++++++++++++ benchmarks/tests/__init__.py | 14 ++++++++++++++ benchmarks/tests/test_sampler.py | 15 +++++++++++++++ benchmarks/tests/test_syntheiszer.py | 15 +++++++++++++++ benchmarks/utils/__init__.py | 14 ++++++++++++++ benchmarks/utils/logging.py | 15 +++++++++++++++ 13 files changed, 192 insertions(+), 1 deletion(-) diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py index e69de29bb2..f9c9993933 100644 --- a/benchmarks/__init__.py +++ b/benchmarks/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/benchmarks/data_utils/README.md b/benchmarks/data_utils/README.md index d21f4e3a63..be846c713b 100644 --- a/benchmarks/data_utils/README.md +++ b/benchmarks/data_utils/README.md @@ -1,8 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + This directory is currently used for generate synthetic data based on the mooncake dataset, but should be easily extendible to any request datasets with (prefix) hash ids, with a current cavaet. The synthesizer is designed to work for jsonl files in the "mooncake" trace file format, meaning that the input are increasing integers of block hashes. For now, new block hashes must be the next consecutive integer, otherwise will not work. ## Quickstart -For instance, you can run: +For instance, you can run from the project root: ``` python -m benchmark.data_synth.synthesizer \ --input-file mooncake_trace.jsonl \ diff --git a/benchmarks/data_utils/__init__.py b/benchmarks/data_utils/__init__.py index e69de29bb2..f9c9993933 100644 --- a/benchmarks/data_utils/__init__.py +++ b/benchmarks/data_utils/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/benchmarks/data_utils/graph_utils.py b/benchmarks/data_utils/graph_utils.py index 82f50aa5b2..8a46dd8757 100644 --- a/benchmarks/data_utils/graph_utils.py +++ b/benchmarks/data_utils/graph_utils.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import numpy as np import networkx as nx diff --git a/benchmarks/data_utils/prefix_analyzer.py b/benchmarks/data_utils/prefix_analyzer.py index cfa44875f8..31bd4590f5 100644 --- a/benchmarks/data_utils/prefix_analyzer.py +++ b/benchmarks/data_utils/prefix_analyzer.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json from collections import Counter diff --git a/benchmarks/data_utils/protocols.py b/benchmarks/data_utils/protocols.py index edb4d3394c..96039f4715 100644 --- a/benchmarks/data_utils/protocols.py +++ b/benchmarks/data_utils/protocols.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Protocol-level constants for synthetic data graph structure. """ diff --git a/benchmarks/data_utils/sampler.py b/benchmarks/data_utils/sampler.py index 9356ee5dca..803e4148e3 100644 --- a/benchmarks/data_utils/sampler.py +++ b/benchmarks/data_utils/sampler.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging from collections import Counter from typing import Any, Dict, List, Optional, Tuple, Union diff --git a/benchmarks/data_utils/synthesizer.py b/benchmarks/data_utils/synthesizer.py index 954947e565..cd35202608 100644 --- a/benchmarks/data_utils/synthesizer.py +++ b/benchmarks/data_utils/synthesizer.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json from collections import Counter diff --git a/benchmarks/tests/__init__.py b/benchmarks/tests/__init__.py index e69de29bb2..f9c9993933 100644 --- a/benchmarks/tests/__init__.py +++ b/benchmarks/tests/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/benchmarks/tests/test_sampler.py b/benchmarks/tests/test_sampler.py index 45879136a5..0901c2b776 100644 --- a/benchmarks/tests/test_sampler.py +++ b/benchmarks/tests/test_sampler.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np from collections import Counter diff --git a/benchmarks/tests/test_syntheiszer.py b/benchmarks/tests/test_syntheiszer.py index 4480c6617b..3e76f51802 100644 --- a/benchmarks/tests/test_syntheiszer.py +++ b/benchmarks/tests/test_syntheiszer.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import json import os import random diff --git a/benchmarks/utils/__init__.py b/benchmarks/utils/__init__.py index e69de29bb2..f9c9993933 100644 --- a/benchmarks/utils/__init__.py +++ b/benchmarks/utils/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/benchmarks/utils/logging.py b/benchmarks/utils/logging.py index 7317858ee0..a60e4aae86 100644 --- a/benchmarks/utils/logging.py +++ b/benchmarks/utils/logging.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Dict, List, Any import numpy as np import pandas as pd From 152b1f1efc32bb2f9a37c6884f24e62e05532d15 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 17:15:37 -0700 Subject: [PATCH 10/50] black (blank line at end) --- benchmarks/__init__.py | 2 +- benchmarks/data_utils/__init__.py | 2 +- benchmarks/tests/__init__.py | 2 +- benchmarks/utils/__init__.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py index f9c9993933..3159bfe656 100644 --- a/benchmarks/__init__.py +++ b/benchmarks/__init__.py @@ -11,4 +11,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/benchmarks/data_utils/__init__.py b/benchmarks/data_utils/__init__.py index f9c9993933..3159bfe656 100644 --- a/benchmarks/data_utils/__init__.py +++ b/benchmarks/data_utils/__init__.py @@ -11,4 +11,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. 
diff --git a/benchmarks/tests/__init__.py b/benchmarks/tests/__init__.py index f9c9993933..3159bfe656 100644 --- a/benchmarks/tests/__init__.py +++ b/benchmarks/tests/__init__.py @@ -11,4 +11,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/benchmarks/utils/__init__.py b/benchmarks/utils/__init__.py index f9c9993933..3159bfe656 100644 --- a/benchmarks/utils/__init__.py +++ b/benchmarks/utils/__init__.py @@ -11,4 +11,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. From 759fabdb3c84354cd8eeb350f7e965b11cb5eeb3 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 18:33:33 -0700 Subject: [PATCH 11/50] fix prompt len bug --- benchmarks/data_utils/README.md | 69 ++++-- benchmarks/data_utils/prefix_analyzer.py | 254 ++++------------------- benchmarks/data_utils/synthesizer.py | 27 +-- benchmarks/utils/logging.py | 3 +- 4 files changed, 113 insertions(+), 240 deletions(-) diff --git a/benchmarks/data_utils/README.md b/benchmarks/data_utils/README.md index be846c713b..4eaf2e18e7 100644 --- a/benchmarks/data_utils/README.md +++ b/benchmarks/data_utils/README.md @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + -This directory is currently used for generate synthetic data based on the mooncake dataset, but should be easily extendible to any request datasets with (prefix) hash ids, with a current cavaet. The synthesizer is designed to work for jsonl files in the "mooncake" trace file format, meaning that the input are increasing integers of block hashes. For now, new block hashes must be the next consecutive integer, otherwise will not work. +## Prefix Analyzer -## Quickstart +The Prefix Analyzer provides statistics on the original trace file, such as Input Sequence Length (ISL), Output Sequence Length (OSL), and theoretical cache hit rate. +It is useful for understanding the structure and reuse patterns in your dataset. -For instance, you can run from the project root: +```bash +python -m benchmarks.data_utils.prefix_analyzer --input-file --block-size ``` -python -m benchmark.data_synth.synthesizer \ ---input-file mooncake_trace.jsonl \ ---num-requests 500 \ ---depth-multiplier 4 \ ---width-multiplier 4 \ ---prompt-len-multiplier 0.1 + +- `--input-file`: Path to your trace file in jsonl format (default: `mooncake_trace.jsonl`) +- `--block-size`: Block size for prefix calculation (default: 512) + +--- + +The script will print out summary statistics for ISL, OSL, user prompt lengths, and the theoretical cache hit rate (assuming an infinite cache). + +## Synthesizer + +The Synthesizer goes a step further: +It builds a prefix tree from the original trace file, extracts prefix statistics, and generates a new synthetic dataset based on these statistics. +You can control various aspects of the synthetic data generation with tunable knobs, such as request rate, context/prompt length multipliers, and the number of tree copies. 
+ +This is useful for generating large, realistic synthetic traces for benchmarking or simulation, while preserving the structural properties of the original dataset. + +### How to run + +```bash +python -m benchmarks.data_utils.synthesizer --input-file --num-requests [other options...] ``` -where `num-requests` sets the number of total synthetic requests generated, `speedup-ratio` tunes the rate at which the requests are sent, `depth-multiplier` tunes the lengths of the request prefixes (higher multiplier will then yield longer ISLs), and `width-multiplier` controls the branching factor of the synthetic requests (higher multiplier will generate more diverse request patterns). -## How it works +**Options:** +- `--input-file`: Path to the input trace file (default: `mooncake_trace.jsonl`) +- `--num-requests`: Number of requests to synthesize (default: 100000) +- `--speedup-ratio`: Factor to speed up request intervals (default: 1) +- `--depth-multiplier`: Multiplier for prefix lengths (default: 1.0) +- `--width-multiplier`: Number of times to replicate the core radix tree (default: 1) +- `--prompt-len-multiplier`: Multiplier for leaf path lengths (default: 1.0, use <1 for shorter prompts) +- `--max-isl`: Maximum input sequence length to include in output (default: None, no filtering) +- `--block-size`: Block size for prefilling and decoding (default: 512) +- `--output-file`: Path to the output file (default: auto-generated from input file and options) + +--- + +This directory is currently used for generating synthetic data based on the mooncake dataset, but should be easily extendible to any request datasets with (prefix) hash ids, with a current caveat. The synthesizer is designed to work for jsonl files in the "mooncake" trace file format, meaning that the input are increasing integers of block hashes. For now, new block hashes must be the next consecutive integer, otherwise will not work. + +### How it works The generation algorithm, simplified, is as follows @@ -40,8 +70,15 @@ The generation algorithm, simplified, is as follows ## Testing -To test for "correctness", or faithfulness to the original mooncake statistics, one can run +To test for "correctness", or faithfulness to the original trace statistics, one can run ``` -python mooncake_synth.py --num-requests 500000 +python -m benchmarks.data_utils.synthesizer \ +--input-file mooncake_trace.jsonl \ +--num-requests 500000 \ +``` +and compare the synthetic ISL statistics (mean, median, std) to the original ISL statistics, which one can obtain by running +``` +python -m benchmarks.data_utils.prefix_analyzer \ +--input-file mooncake_trace.jsonl \ ``` -and compare the synthetic ISL statistics (mean, median, std) to the original ISL statistics. I find this to be the most "robust" end-to-end test. +I find this to be the most "robust" end-to-end test. It is important to sample a large number of requests (e.g., hundreds of thousands) to ensure the statistics are meaningful, due to the law of large numbers. In particular, the mean statistics (such as mean ISL) should be well preserved in the synthetic data. However, the standard deviation statistics—especially for ISL—are not expected to match exactly, since the synthesizer does not capture the correlation between context length and prompt length present in the original data. 
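The end-to-end comparison described in the Testing section above can also be scripted directly against the classes added in this patch series. Below is a minimal sketch, assuming the `benchmarks.data_utils` module layout introduced here and a local `mooncake_trace.jsonl`; the trace path and request count are placeholders.

```python
# Minimal sketch: compare original vs. synthetic ISL statistics end to end.
# Assumes the benchmarks.data_utils layout above; the trace path is a placeholder.
import numpy as np

from benchmarks.data_utils.prefix_analyzer import PrefixAnalyzer
from benchmarks.data_utils.synthesizer import Synthesizer

trace_file = "mooncake_trace.jsonl"

# Original statistics (analyze() also prints a summary table)
original_metrics = PrefixAnalyzer(trace_file, block_size=512).analyze()
original_isl = np.array(original_metrics["Input Length"])

# Synthetic statistics; a large sample keeps the mean statistics stable
requests = Synthesizer(trace_file, block_size=512).synthesize_requests(500_000)
synthetic_isl = np.array([req["input_length"] for req in requests])

print(f"original  ISL: mean={original_isl.mean():.1f}, std={original_isl.std():.1f}")
print(f"synthetic ISL: mean={synthetic_isl.mean():.1f}, std={synthetic_isl.std():.1f}")
```

As noted above, the means should track closely while the ISL standard deviation may differ, since context length and prompt length are sampled independently by the synthesizer.
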
diff --git a/benchmarks/data_utils/prefix_analyzer.py b/benchmarks/data_utils/prefix_analyzer.py index 31bd4590f5..019882b115 100644 --- a/benchmarks/data_utils/prefix_analyzer.py +++ b/benchmarks/data_utils/prefix_analyzer.py @@ -16,11 +16,7 @@ import json from collections import Counter -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from tabulate import tabulate -from tqdm import tqdm +from benchmarks.utils.logging import calculate_and_print_statistics class PrefixAnalyzer: @@ -40,9 +36,11 @@ def __init__(self, dataset_path, block_size=1): self.block_size = block_size self.dataset = self._load_dataset() self.hash_counter = self._build_hash_counter() - self.repeated_hash_ids = self._find_repeated_hash_ids() + self.repeated_hash_ids = { + hash_id for hash_id, count in self.hash_counter.items() if count > 1 + } - def _load_dataset(self): + def _load_dataset(self) -> list: print(f"Loading dataset from {self.dataset_path}...") dataset = [] with open(self.dataset_path, "r") as f: @@ -51,18 +49,15 @@ def _load_dataset(self): print(f"Dataset loaded: {len(dataset)} examples") return dataset - def _build_hash_counter(self): + def _build_hash_counter(self) -> Counter: all_hash_ids = [] - for item in tqdm(self.dataset, desc="Processing hash IDs"): + for item in self.dataset: all_hash_ids.extend(item["hash_ids"]) counter = Counter(all_hash_ids) print(f"Hash counter built: {len(counter)} unique hash IDs") return counter - def _find_repeated_hash_ids(self): - return {hash_id for hash_id, count in self.hash_counter.items() if count > 1} - - def analyze_dataset_lengths(self): + def analyze(self) -> dict[str, list]: """ Analyze dataset to extract various length metrics and print statistics. @@ -77,11 +72,7 @@ def analyze_dataset_lengths(self): prefix_lengths = [] user_prompt_lengths = [] - for i, item in tqdm( - enumerate(self.dataset), - total=len(self.dataset), - desc="Analyzing dataset lengths", - ): + for i, item in enumerate(self.dataset): input_len = item["input_length"] hash_ids = item["hash_ids"] assert len(hash_ids) * self.block_size >= input_len @@ -105,133 +96,30 @@ def analyze_dataset_lengths(self): if prefix_len > input_len: print(f"WARNING: Line {i}: {json.dumps(item)}") + cache_hit_rates = self._analyze_cache_hit_rates() + # Print statistics table metrics = { "Input Length": input_lengths, - "Prefix Length": prefix_lengths, - "User Prompt Length": user_prompt_lengths, + "Context Length": prefix_lengths, + "Unique Prompt Length": user_prompt_lengths, "Output Length": output_lengths, + "Theoretical Hit Rates": cache_hit_rates, } - print_statistics_table(metrics) + calculate_and_print_statistics(metrics) - return input_lengths, prefix_lengths, user_prompt_lengths, output_lengths + return metrics - def visited_radix_lens(self, ax=None, legend=None): - """ - Analyze radix lengths based on hash IDs with the same repetition count as the first hash. 
- - Args: - ax: Matplotlib axis handle for plotting (if None, creates a new figure) - legend: Legend label for this dataset in the plot - - Returns: - The matplotlib axis handle - """ - # For each row, calculate the radix length based on the first hash's repetition count - radix_lengths = [] - - for item in tqdm(self.dataset): - # Skip if there are no hash_ids - if len(item["hash_ids"]) == 0: - continue - - # Get the repetition count of the first hash ID - first_hash = item["hash_ids"][0] - first_hash_repetition_count = self.hash_counter[first_hash] - if first_hash_repetition_count <= 1: - continue - - # Count how many hash IDs in this row have the same repetition count - matching_hash_ids = sum( - 1 - for hash_id in item["hash_ids"] - if self.hash_counter[hash_id] == first_hash_repetition_count - ) - - # Calculate radix length - radix_length = matching_hash_ids * self.block_size - radix_lengths.append((first_hash, radix_length)) - - # Count occurrences of each (first_hash, radix_length) tuple - radix_lens_counter = Counter(radix_lengths) - - # Create a new figure if no axis is provided - if ax is None: - fig, ax = plt.subplots(figsize=(10, 6)) - - # Extract x and y values for plotting - x_values = [tup[1] for tup in radix_lens_counter.keys()] - y_values = list(radix_lens_counter.values()) - - # Plot with legend if provided - scatter = ax.scatter( - x_values, y_values, alpha=0.7, label=legend + " (trunk)" if legend else None - ) - - # Now analyze based on the last hash with nonzero repeat count - radix_lengths_last = [] - for item in tqdm(self.dataset): - # Skip if there are no hash_ids - if len(item["hash_ids"]) == 0: - continue - - # Find the last hash with nonzero repeat count - last_hash = None - for hash_id in reversed(item["hash_ids"]): - if self.hash_counter[hash_id] > 1: - last_hash = hash_id - break - - if last_hash is None: - continue - - last_hash_repetition_count = self.hash_counter[last_hash] - - # Count how many hash IDs in this row have repeat count greater than the last hash's - matching_hash_ids = sum( - 1 - for hash_id in item["hash_ids"] - if self.hash_counter[hash_id] > last_hash_repetition_count - ) - - # Calculate radix length - radix_length = matching_hash_ids * self.block_size - radix_lengths_last.append((last_hash, radix_length)) - - # Count occurrences of each (last_hash, radix_length) tuple - radix_lens_last_counter = Counter(radix_lengths_last) - - # Extract x and y values for plotting - x_values_last = [tup[1] for tup in radix_lens_last_counter.keys()] - y_values_last = list(radix_lens_last_counter.values()) - - # Plot with legend if provided - scatter_last = ax.scatter( - x_values_last, - y_values_last, - alpha=0.7, - marker="x", - label=legend + " (branch)" if legend else None, - ) - - # Add legend if any labels exist - if legend is not None: - ax.legend() - - ax.set_xlabel("Radix Length") - ax.set_ylabel("Visited") - ax.set_xscale("log") - # ax.set_yscale('log') - ax.grid(True, linestyle="--", alpha=0.7) - - # Return the axis for further modifications - return ax - - def analyze_cache_hit_rates(self): + def _analyze_cache_hit_rates(self) -> list[float]: """ Analyze theoretical cache hit rates based on hash ID repetition. + Assumes that hash IDs are cached as the dataset is iterated through, + i.e., each hash ID is considered "cached" after its first appearance, + similar to how KV caching would work in real life. + Assumes the cache is infinite in size (hence "theoretical"), so no hash IDs are ever evicted. 
+ Returns: List of cache hit rates for each row in the dataset """ @@ -241,7 +129,7 @@ def analyze_cache_hit_rates(self): # Store cache hit rates for each row cache_hit_rates = [] - for item in tqdm(self.dataset, desc="Calculating cache hit rates"): + for item in self.dataset: hash_ids = item["hash_ids"] # Skip if there are no hash IDs @@ -262,78 +150,29 @@ def analyze_cache_hit_rates(self): # Add all hash IDs from this row to the seen set seen_hash_ids.update(hash_ids) - # Create histogram of cache hit rates - plt.figure(figsize=(10, 6)) - plt.hist( - cache_hit_rates, bins=50, alpha=0.7, color="skyblue", edgecolor="black" - ) - plt.xlabel("Cache Hit Rate") - plt.ylabel("Frequency") - plt.title("Theoretical Cache Hit Rates") - - # Add statistics text to the plot directly - stats_text = ( - f"Mean: {np.mean(cache_hit_rates):.4f}\n" - f"Median: {np.median(cache_hit_rates):.4f}\n" - f"Min: {np.min(cache_hit_rates):.4f}\n" - f"Max: {np.max(cache_hit_rates):.4f}\n" - f"Std Dev: {np.std(cache_hit_rates):.4f}" - ) - - # Position the text in the upper right corner with some padding - plt.text( - 0.95, - 0.95, - stats_text, - transform=plt.gca().transAxes, - verticalalignment="top", - horizontalalignment="right", - bbox=dict(boxstyle="round", facecolor="white", alpha=0.8), - ) - - plt.grid(True, linestyle="--", alpha=0.7) - plt.savefig("theoretical_hit_rates.png", dpi=300, bbox_inches="tight") - plt.close() - return cache_hit_rates -def print_statistics_table(metrics): - """ - Print a formatted table of statistics for the given metrics. - - Args: - metrics: Dictionary mapping metric names to lists of values - """ - stats_data = [] - for metric_name, values in metrics.items(): - stats_data.append( - { - "Metric": metric_name, - "Mean": np.mean(values), - "Std Dev": np.std(values), - "Min": np.min(values), - "P25": np.percentile(values, 25), - "Median": np.median(values), - "P75": np.percentile(values, 75), - "Max": np.max(values), - } - ) - - # Create DataFrame from the collected statistics - stats_df = pd.DataFrame(stats_data) - stats_df = stats_df.set_index("Metric") - stats_df = stats_df.round(2) - - # Print the table using tabulate with a pretty format - print(tabulate(stats_df, headers="keys", tablefmt="pretty")) - - if __name__ == "__main__": - # Main routine that uses the specified dataset with block size of 16 - block_size = 16 - dataset_path = f"../datasets/avian_r100000_bs{block_size}_synth.jsonl" - # dataset_path = "/home/rupei/nova-benchmarking/datasets/gen_prompts_32k_2_languages_16.jsonl" + import argparse + + parser = argparse.ArgumentParser(description="Analyze prefix dataset statistics") + parser.add_argument( + "--input-file", + type=str, + default="mooncake_trace.jsonl", + help="Path to the input dataset file (default: mooncake_trace.jsonl)", + ) + parser.add_argument( + "--block-size", + type=int, + default=512, + help="Block size for prefix calculation (default: 512)", + ) + args = parser.parse_args() + + block_size = args.block_size + dataset_path = args.input_file print(f"Analyzing dataset: {dataset_path}") print(f"Using block size: {block_size}") @@ -341,11 +180,4 @@ def print_statistics_table(metrics): # Create analyzer instance analyzer = PrefixAnalyzer(dataset_path, block_size=block_size) - - # Run analyses - input_lens, prefix_lens, user_prompt_lens, output_lens = ( - analyzer.analyze_dataset_lengths() - ) - analyzer.analyze_cache_hit_rates() - - print(f"\nAnalysis complete. 
Processed {len(input_lens)} examples.") + analyzer.analyze() diff --git a/benchmarks/data_utils/synthesizer.py b/benchmarks/data_utils/synthesizer.py index cd35202608..de297f480c 100644 --- a/benchmarks/data_utils/synthesizer.py +++ b/benchmarks/data_utils/synthesizer.py @@ -15,20 +15,20 @@ import json from collections import Counter +from typing import Any, Optional import networkx as nx import numpy as np import pandas as pd -from typing import Optional -from benchmarks.utils.logging import calculate_and_print_statistics from benchmarks.data_utils.graph_utils import ( _merge_chains, _precompute_transition_cdfs, _remove_leaves, ) +from benchmarks.data_utils.protocols import CACHE_END, END_NODE, SUPER_ROOT from benchmarks.data_utils.sampler import EmpiricalSampler, sample_from_cdf -from benchmarks.data_utils.protocols import SUPER_ROOT, CACHE_END, END_NODE +from benchmarks.utils.logging import calculate_and_print_statistics class Synthesizer: @@ -250,7 +250,7 @@ def synthesize_path(self) -> tuple[list[int], bool, int]: break # break and don't sample leaf if next_node == END_NODE: - return path, False, 0 + return path, False, context_len # otherwise continue down prefix tree @@ -270,7 +270,7 @@ def synthesize_path(self) -> tuple[list[int], bool, int]: def synthesize_requests( self, num_requests: int, input_len_filter: Optional[int] = None - ) -> list[dict[str, any]]: + ) -> list[dict[str, Any]]: timestamp = 0 requests = [] @@ -342,9 +342,9 @@ def __repr__(self) -> str: rep += "\nRoot nodes (grouped by length, visited count ≥ 5):\n" for length, group in grouped: - top_nodes = group.head(5) - visit_counts = top_nodes["Visited Count"].tolist() - rep += f"\nLength: {length}, Visited Counts: {visit_counts}" + nodes = group["Child Node"].tolist() + visit_counts = group["Visited Count"].tolist() + rep += f"\nNodes: {nodes}, Path Length: {length}, Visited Counts: {visit_counts}" return rep @@ -356,7 +356,7 @@ def __repr__(self) -> str: parser = argparse.ArgumentParser(description="Synthesize Mooncake-Esque dataset") parser.add_argument( "--input-file", - default="../datasets/mooncake_trace.jsonl", + default="mooncake_trace.jsonl", type=str, help="Path to the input CSV file", ) @@ -413,7 +413,10 @@ def __repr__(self) -> str: dataset_file = Path(args.input_file).resolve() if args.output_file is None: output_file = dataset_file.with_stem( - f"{dataset_file.stem}_synthesized_{int(args.depth_multiplier)}x{args.width_multiplier}+{args.prompt_len_multiplier}+{args.speedup_ratio}+{args.max_isl}" + f"{dataset_file.stem}_synth" + + f"_{int(args.depth_multiplier)}x{args.width_multiplier}+{args.prompt_len_multiplier}" + + f"_speedup{args.speedup_ratio}" + + f"_maxisl{args.max_isl}" ) else: output_file = Path(args.output_file).resolve() @@ -437,10 +440,10 @@ def __repr__(self) -> str: # Extract all values first metrics = { - "ISL": [req["input_length"] for req in requests], + "Input Length": [req["input_length"] for req in requests], "Context Length": [req["context_len"] for req in requests], "Unique Prompt Length": [req["unique_user_prompt_len"] for req in requests], - "OSL": [req["output_length"] for req in requests], + "Output Length": [req["output_length"] for req in requests], } # Initialize lists to store the data diff --git a/benchmarks/utils/logging.py b/benchmarks/utils/logging.py index a60e4aae86..cf5715a263 100644 --- a/benchmarks/utils/logging.py +++ b/benchmarks/utils/logging.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict, List, Any +from typing import Dict, List + import numpy as np import pandas as pd from tabulate import tabulate From a57d8ae55027c46a7f8b174173ba80cd207c0689 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 18:37:46 -0700 Subject: [PATCH 12/50] no tabulate dep --- benchmarks/utils/logging.py | 3 +-- container/deps/requirements.txt | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/utils/logging.py b/benchmarks/utils/logging.py index cf5715a263..a6212d87a5 100644 --- a/benchmarks/utils/logging.py +++ b/benchmarks/utils/logging.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd -from tabulate import tabulate def calculate_and_print_statistics(metrics: Dict[str, List[float]]) -> pd.DataFrame: @@ -49,6 +48,6 @@ def calculate_and_print_statistics(metrics: Dict[str, List[float]]) -> pd.DataFr ) stats_df = pd.DataFrame(stats_data, index=metric_names) - print(tabulate(stats_df.round(2), headers="keys", tablefmt="pretty")) + print(stats_df, "\n") return stats_df diff --git a/container/deps/requirements.txt b/container/deps/requirements.txt index e4efcae387..6e3d13490d 100644 --- a/container/deps/requirements.txt +++ b/container/deps/requirements.txt @@ -33,7 +33,6 @@ pydantic==2.7.1 pyright PyYAML sentencepiece -tabulate tensorboard==2.19.0 tensorboardX==2.6.2.2 transformers From 4c32c52001546dc875c55dc012656c874d6259a7 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 18:42:30 -0700 Subject: [PATCH 13/50] pandas license --- ATTRIBUTIONS.md | 120 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 39 deletions(-) diff --git a/ATTRIBUTIONS.md b/ATTRIBUTIONS.md index 5434dd6a21..f42fd441ca 100644 --- a/ATTRIBUTIONS.md +++ b/ATTRIBUTIONS.md @@ -228,44 +228,6 @@ limitations under the License. ``` -## networkx - [3-Clause BSD License](https://github.com/networkx/networkx/blob/main/LICENSE.txt) - - ``` -Copyright (C) 2004-2024, NetworkX Developers -Aric Hagberg -Dan Schult -Pieter Swart -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the NetworkX Developers nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- ``` - ## ucx-py-cu12 - [BSD 3-Clause "New" or "Revised" License](https://github.com/rapidsai/ucx-py/blob/main/LICENSE) ``` @@ -529,6 +491,8 @@ THE SOFTWARE. limitations under the License. ``` + + ## pydantic - [MIT License](https://github.com/pydantic/pydantic/blob/main/LICENSE) ``` @@ -3470,4 +3434,82 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -``` \ No newline at end of file +``` + +## networkx - [BSD 3-Clause License](https://github.com/networkx/networkx/blob/main/LICENSE.txt) + + ``` +NetworkX is distributed with the 3-clause BSD license. + +:: + + Copyright (C) 2004-2024, NetworkX Developers + Aric Hagberg + Dan Schult + Pieter Swart + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the NetworkX Developers nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ``` + +## pandas - [BSD 3-Clause License](https://github.com/pandas-dev/pandas/blob/main/LICENSE) + + ``` +BSD 3-Clause License + +Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team +All rights reserved. + +Copyright (c) 2011-2025, Open source contributors. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ``` \ No newline at end of file From 905d04b0a41bef38be00ef5e40856dbf6ad0c057 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 18:47:14 -0700 Subject: [PATCH 14/50] rm extra new lines --- ATTRIBUTIONS.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/ATTRIBUTIONS.md b/ATTRIBUTIONS.md index f42fd441ca..41210e55b4 100644 --- a/ATTRIBUTIONS.md +++ b/ATTRIBUTIONS.md @@ -491,8 +491,6 @@ THE SOFTWARE. limitations under the License. ``` - - ## pydantic - [MIT License](https://github.com/pydantic/pydantic/blob/main/LICENSE) ``` From 4e41f5b629d42b6ffab58bde03c6ea8405059942 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 18:49:37 -0700 Subject: [PATCH 15/50] pre commits --- benchmarks/data_utils/graph_utils.py | 4 ++-- benchmarks/tests/test_sampler.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/data_utils/graph_utils.py b/benchmarks/data_utils/graph_utils.py index 8a46dd8757..7dda478c27 100644 --- a/benchmarks/data_utils/graph_utils.py +++ b/benchmarks/data_utils/graph_utils.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import networkx as nx +import numpy as np +from benchmarks.data_utils.protocols import CACHE_END, END_NODE, SUPER_ROOT from benchmarks.data_utils.sampler import get_cdf -from benchmarks.data_utils.protocols import SUPER_ROOT, CACHE_END, END_NODE def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: diff --git a/benchmarks/tests/test_sampler.py b/benchmarks/tests/test_sampler.py index 0901c2b776..dcbbc5ac63 100644 --- a/benchmarks/tests/test_sampler.py +++ b/benchmarks/tests/test_sampler.py @@ -13,9 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np from collections import Counter +import numpy as np + from benchmarks.data_utils.sampler import EmpiricalSampler From 20040ebe59ac3b17321e4f873a5b7092f5b676e9 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 20:47:24 -0700 Subject: [PATCH 16/50] make mypy happy --- benchmarks/data_utils/synthesizer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/benchmarks/data_utils/synthesizer.py b/benchmarks/data_utils/synthesizer.py index de297f480c..7ad392cb90 100644 --- a/benchmarks/data_utils/synthesizer.py +++ b/benchmarks/data_utils/synthesizer.py @@ -233,7 +233,7 @@ def synthesize_path(self) -> tuple[list[int], bool, int]: """ # Start from root node (-1) current_node = SUPER_ROOT - path = [] + path: list[int] = [] context_len = 0 # Continue until we reach a node with no outgoing edges @@ -273,7 +273,7 @@ def synthesize_requests( ) -> list[dict[str, Any]]: timestamp = 0 - requests = [] + requests: list[dict[str, Any]] = [] request_id = 0 while request_id < num_requests: @@ -423,7 +423,7 @@ def __repr__(self) -> str: print("learning from dataset...", flush=True) synthesizer = Synthesizer( - dataset_file, + str(dataset_file), block_size=args.block_size, speedup_ratio=args.speedup_ratio, context_len_multiplier=args.depth_multiplier, @@ -446,10 +446,6 @@ def __repr__(self) -> str: "Output Length": [req["output_length"] for req in requests], } - # Initialize lists to store the data - metric_names = [] - stats_data = [] - # Calculate statistics for each metric calculate_and_print_statistics(metrics) From 4d10bf8d81848fbe84d99d22b12f6ad6b1f62a6f Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 21:29:27 -0700 Subject: [PATCH 17/50] rolling hasher --- benchmarks/data_utils/hasher.py | 59 +++++++++++++++++++++++++++++++++ benchmarks/tests/test_hasher.py | 40 ++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 benchmarks/data_utils/hasher.py create mode 100644 benchmarks/tests/test_hasher.py diff --git a/benchmarks/data_utils/hasher.py b/benchmarks/data_utils/hasher.py new file mode 100644 index 0000000000..21b5cd34eb --- /dev/null +++ b/benchmarks/data_utils/hasher.py @@ -0,0 +1,59 @@ +from typing import Dict, List + +from transformers.tokenization_utils_base import PreTrainedTokenizerBase + + +def texts_to_hashes( + tokenizer: PreTrainedTokenizerBase, texts: List[str], block_size: int = 512 +) -> List[List[int]]: + """ + Tokenizes a list of strings (without special tokens), splits tokens into blocks, + computes rolling hashes, and returns a list of lists of integer-mapped rolling hashes + for each input string. + + Args: + tokenizer: Tokenizer object with a .encode method. + texts (List[str]): List of input strings. + block_size (int): Size of each token block for hashing. + + Returns: + List[List[int]]: List of lists of integer-mapped rolling hashes for each block of each input string. 
+ """ + # Batch tokenize for efficiency + batch_encoding = tokenizer( + texts, + add_special_tokens=False, + return_attention_mask=False, + return_token_type_ids=False, + ) + # batch_encoding["input_ids"] is a List[List[int]] + all_tokens: List[List[int]] = batch_encoding["input_ids"] + + results: List[List[int]] = [] + hash_to_int: Dict[int, int] = {} + next_int = 0 + + for tokens in all_tokens: + blocks: List[List[int]] = [ + tokens[i : i + block_size] for i in range(0, len(tokens), block_size) + ] + + parent_hash = 0 + hashes: List[int] = [] + + print(blocks) + for block in blocks: + combined = (parent_hash, hash(tuple(block))) + global_hash = hash(combined) + + # Map global_hash to a unique integer + if global_hash not in hash_to_int: + hash_to_int[global_hash] = next_int + next_int += 1 + + hashes.append(hash_to_int[global_hash]) + parent_hash = global_hash + + results.append(hashes) + + return results diff --git a/benchmarks/tests/test_hasher.py b/benchmarks/tests/test_hasher.py new file mode 100644 index 0000000000..5c50501f97 --- /dev/null +++ b/benchmarks/tests/test_hasher.py @@ -0,0 +1,40 @@ +import pytest +from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers +from transformers import PreTrainedTokenizerFast + +from benchmarks.data_utils.hasher import texts_to_hashes + + +@pytest.fixture(scope="module") +def dummy_tokenizer(): + vocab = [chr(i) for i in range(ord("a"), ord("z") + 1)] + vocab.append("[UNK]") + vocab_dict = {token: idx for idx, token in enumerate(vocab)} + + tokenizer_model = models.WordLevel(vocab=vocab_dict, unk_token="[UNK]") + tokenizer = Tokenizer(tokenizer_model) + tokenizer.normalizer = normalizers.Sequence( + [normalizers.NFD(), normalizers.Lowercase()] + ) + tokenizer.pre_tokenizer = pre_tokenizers.Whitespace() + tokenizer.decoder = decoders.WordPiece(prefix="") + + return PreTrainedTokenizerFast( + tokenizer_object=tokenizer, + unk_token="[UNK]", + pad_token="[PAD]", + bos_token="[BOS]", + eos_token="[EOS]", + ) + + +def test_texts_to_hashes_blocks(dummy_tokenizer): + dum1 = "a b c d" + dum2 = "e f g h" + dum3 = "i j k l" + + texts = [dum1, dum1 + " " + dum2, dum1 + " " + dum3, dum2 + " " + dum1] + expected = [[0], [0, 1], [0, 2], [3, 4]] + + result = texts_to_hashes(dummy_tokenizer, texts, block_size=4) + assert result == expected, f"Expected {expected}, got {result}" From 1517f1f76257b1965f351cd7c083b40d4fa0df71 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 21:31:21 -0700 Subject: [PATCH 18/50] readme update --- benchmarks/data_utils/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/data_utils/README.md b/benchmarks/data_utils/README.md index 4eaf2e18e7..f94ebdf004 100644 --- a/benchmarks/data_utils/README.md +++ b/benchmarks/data_utils/README.md @@ -58,6 +58,8 @@ python -m benchmarks.data_utils.synthesizer --input-file - This directory is currently used for generating synthetic data based on the mooncake dataset, but should be easily extendible to any request datasets with (prefix) hash ids, with a current caveat. The synthesizer is designed to work for jsonl files in the "mooncake" trace file format, meaning that the input are increasing integers of block hashes. For now, new block hashes must be the next consecutive integer, otherwise will not work. +If you want to generate these increasing hash ids from a list of texts, you can use the `texts_to_hashes` function in `hasher.py`. 
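As a rough illustration of what the `texts_to_hashes` function added above computes, the following standalone sketch applies the same rolling block-hash idea directly to lists of token ids, so it runs without any tokenizer. The function name `token_ids_to_hashes` and the sample inputs are illustrative assumptions, not code from the repository.

```python
# Illustrative sketch only: mirrors the rolling block-hash scheme of texts_to_hashes,
# but takes token-id lists directly so no tokenizer is needed.
from typing import Dict, List


def token_ids_to_hashes(
    token_lists: List[List[int]], block_size: int = 4
) -> List[List[int]]:
    hash_to_int: Dict[int, int] = {}  # global block hash -> consecutive integer id
    next_int = 0
    results: List[List[int]] = []

    for tokens in token_lists:
        parent_hash = 0
        row: List[int] = []
        for i in range(0, len(tokens), block_size):
            block = tuple(tokens[i : i + block_size])
            # chain each block's hash to its prefix, like a rolling hash
            parent_hash = hash((parent_hash, hash(block)))
            if parent_hash not in hash_to_int:
                hash_to_int[parent_hash] = next_int  # hand out the next consecutive id
                next_int += 1
            row.append(hash_to_int[parent_hash])
        results.append(row)
    return results


if __name__ == "__main__":
    # shared prefixes reuse ids; each unseen block gets the next consecutive integer
    print(token_ids_to_hashes([[1, 2, 3, 4], [1, 2, 3, 4, 5, 6, 7, 8], [5, 6, 7, 8]]))
    # -> [[0], [0, 1], [2]]
```

Because new ids are handed out consecutively, the output satisfies the "next consecutive integer" convention that the synthesizer expects from mooncake-style traces.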
+ ### How it works The generation algorithm, simplified, is as follows From ec738d67d6e628475a3fd5c6928793833e4f36c1 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 21:32:55 -0700 Subject: [PATCH 19/50] license in test_hasher.py --- benchmarks/tests/test_hasher.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/benchmarks/tests/test_hasher.py b/benchmarks/tests/test_hasher.py index 5c50501f97..11e9314dc0 100644 --- a/benchmarks/tests/test_hasher.py +++ b/benchmarks/tests/test_hasher.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers from transformers import PreTrainedTokenizerFast From 34caf4172958a7e29e92b5f40d286c6570b1530c Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 14 May 2025 23:04:12 -0700 Subject: [PATCH 20/50] copyright in hasher.py --- benchmarks/data_utils/hasher.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/benchmarks/data_utils/hasher.py b/benchmarks/data_utils/hasher.py index 21b5cd34eb..44a20cecab 100644 --- a/benchmarks/data_utils/hasher.py +++ b/benchmarks/data_utils/hasher.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from typing import Dict, List from transformers.tokenization_utils_base import PreTrainedTokenizerBase From e065c54b2b59ce812389bbc69699d6417cd69b9d Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 17 May 2025 20:04:57 -0700 Subject: [PATCH 21/50] typo synthesizer --- benchmarks/tests/{test_syntheiszer.py => test_synthesizer.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename benchmarks/tests/{test_syntheiszer.py => test_synthesizer.py} (100%) diff --git a/benchmarks/tests/test_syntheiszer.py b/benchmarks/tests/test_synthesizer.py similarity index 100% rename from benchmarks/tests/test_syntheiszer.py rename to benchmarks/tests/test_synthesizer.py From 1c85d8dcbe19bdf9c18a1891f2beab72413d0d0f Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 17 May 2025 20:06:55 -0700 Subject: [PATCH 22/50] logging into main dir --- benchmarks/{utils => data_utils}/logging.py | 0 benchmarks/data_utils/prefix_analyzer.py | 2 +- benchmarks/data_utils/synthesizer.py | 2 +- benchmarks/utils/__init__.py | 14 -------------- 4 files changed, 2 insertions(+), 16 deletions(-) rename benchmarks/{utils => data_utils}/logging.py (100%) delete mode 100644 benchmarks/utils/__init__.py diff --git a/benchmarks/utils/logging.py b/benchmarks/data_utils/logging.py similarity index 100% rename from benchmarks/utils/logging.py rename to benchmarks/data_utils/logging.py diff --git a/benchmarks/data_utils/prefix_analyzer.py b/benchmarks/data_utils/prefix_analyzer.py index 019882b115..e99634c346 100644 --- a/benchmarks/data_utils/prefix_analyzer.py +++ b/benchmarks/data_utils/prefix_analyzer.py @@ -16,7 +16,7 @@ import json from collections import Counter -from benchmarks.utils.logging import calculate_and_print_statistics +from benchmarks.data_utils.logging import calculate_and_print_statistics class PrefixAnalyzer: diff --git a/benchmarks/data_utils/synthesizer.py b/benchmarks/data_utils/synthesizer.py index 7ad392cb90..20f15f943e 100644 --- a/benchmarks/data_utils/synthesizer.py +++ b/benchmarks/data_utils/synthesizer.py @@ -26,9 +26,9 @@ _precompute_transition_cdfs, _remove_leaves, ) +from benchmarks.data_utils.logging import calculate_and_print_statistics from benchmarks.data_utils.protocols import CACHE_END, END_NODE, SUPER_ROOT from benchmarks.data_utils.sampler import EmpiricalSampler, sample_from_cdf -from benchmarks.utils.logging import calculate_and_print_statistics class Synthesizer: diff --git a/benchmarks/utils/__init__.py b/benchmarks/utils/__init__.py deleted file mode 100644 index 3159bfe656..0000000000 --- a/benchmarks/utils/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
From 0e736830948c1ad675780791bee88a08c3889ac5 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 17 May 2025 20:07:49 -0700 Subject: [PATCH 23/50] rename to data_generator --- benchmarks/{data_utils => data_generator}/README.md | 0 benchmarks/{data_utils => data_generator}/__init__.py | 0 benchmarks/{data_utils => data_generator}/graph_utils.py | 4 ++-- benchmarks/{data_utils => data_generator}/hasher.py | 0 benchmarks/{data_utils => data_generator}/logging.py | 0 .../{data_utils => data_generator}/prefix_analyzer.py | 2 +- benchmarks/{data_utils => data_generator}/protocols.py | 0 benchmarks/{data_utils => data_generator}/sampler.py | 0 benchmarks/{data_utils => data_generator}/synthesizer.py | 8 ++++---- benchmarks/tests/test_hasher.py | 2 +- benchmarks/tests/test_sampler.py | 2 +- benchmarks/tests/test_synthesizer.py | 2 +- 12 files changed, 10 insertions(+), 10 deletions(-) rename benchmarks/{data_utils => data_generator}/README.md (100%) rename benchmarks/{data_utils => data_generator}/__init__.py (100%) rename benchmarks/{data_utils => data_generator}/graph_utils.py (96%) rename benchmarks/{data_utils => data_generator}/hasher.py (100%) rename benchmarks/{data_utils => data_generator}/logging.py (100%) rename benchmarks/{data_utils => data_generator}/prefix_analyzer.py (98%) rename benchmarks/{data_utils => data_generator}/protocols.py (100%) rename benchmarks/{data_utils => data_generator}/sampler.py (100%) rename benchmarks/{data_utils => data_generator}/synthesizer.py (98%) diff --git a/benchmarks/data_utils/README.md b/benchmarks/data_generator/README.md similarity index 100% rename from benchmarks/data_utils/README.md rename to benchmarks/data_generator/README.md diff --git a/benchmarks/data_utils/__init__.py b/benchmarks/data_generator/__init__.py similarity index 100% rename from benchmarks/data_utils/__init__.py rename to benchmarks/data_generator/__init__.py diff --git a/benchmarks/data_utils/graph_utils.py b/benchmarks/data_generator/graph_utils.py similarity index 96% rename from benchmarks/data_utils/graph_utils.py rename to benchmarks/data_generator/graph_utils.py index 7dda478c27..82dbdea9e1 100644 --- a/benchmarks/data_utils/graph_utils.py +++ b/benchmarks/data_generator/graph_utils.py @@ -16,8 +16,8 @@ import networkx as nx import numpy as np -from benchmarks.data_utils.protocols import CACHE_END, END_NODE, SUPER_ROOT -from benchmarks.data_utils.sampler import get_cdf +from benchmarks.data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT +from benchmarks.data_generator.sampler import get_cdf def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: diff --git a/benchmarks/data_utils/hasher.py b/benchmarks/data_generator/hasher.py similarity index 100% rename from benchmarks/data_utils/hasher.py rename to benchmarks/data_generator/hasher.py diff --git a/benchmarks/data_utils/logging.py b/benchmarks/data_generator/logging.py similarity index 100% rename from benchmarks/data_utils/logging.py rename to benchmarks/data_generator/logging.py diff --git a/benchmarks/data_utils/prefix_analyzer.py b/benchmarks/data_generator/prefix_analyzer.py similarity index 98% rename from benchmarks/data_utils/prefix_analyzer.py rename to benchmarks/data_generator/prefix_analyzer.py index e99634c346..4c19fb5cb7 100644 --- a/benchmarks/data_utils/prefix_analyzer.py +++ b/benchmarks/data_generator/prefix_analyzer.py @@ -16,7 +16,7 @@ import json from collections import Counter -from benchmarks.data_utils.logging import calculate_and_print_statistics +from benchmarks.data_generator.logging 
import calculate_and_print_statistics class PrefixAnalyzer: diff --git a/benchmarks/data_utils/protocols.py b/benchmarks/data_generator/protocols.py similarity index 100% rename from benchmarks/data_utils/protocols.py rename to benchmarks/data_generator/protocols.py diff --git a/benchmarks/data_utils/sampler.py b/benchmarks/data_generator/sampler.py similarity index 100% rename from benchmarks/data_utils/sampler.py rename to benchmarks/data_generator/sampler.py diff --git a/benchmarks/data_utils/synthesizer.py b/benchmarks/data_generator/synthesizer.py similarity index 98% rename from benchmarks/data_utils/synthesizer.py rename to benchmarks/data_generator/synthesizer.py index 20f15f943e..2d114e61ce 100644 --- a/benchmarks/data_utils/synthesizer.py +++ b/benchmarks/data_generator/synthesizer.py @@ -21,14 +21,14 @@ import numpy as np import pandas as pd -from benchmarks.data_utils.graph_utils import ( +from benchmarks.data_generator.graph_utils import ( _merge_chains, _precompute_transition_cdfs, _remove_leaves, ) -from benchmarks.data_utils.logging import calculate_and_print_statistics -from benchmarks.data_utils.protocols import CACHE_END, END_NODE, SUPER_ROOT -from benchmarks.data_utils.sampler import EmpiricalSampler, sample_from_cdf +from benchmarks.data_generator.logging import calculate_and_print_statistics +from benchmarks.data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT +from benchmarks.data_generator.sampler import EmpiricalSampler, sample_from_cdf class Synthesizer: diff --git a/benchmarks/tests/test_hasher.py b/benchmarks/tests/test_hasher.py index 11e9314dc0..e5a53cbd6e 100644 --- a/benchmarks/tests/test_hasher.py +++ b/benchmarks/tests/test_hasher.py @@ -17,7 +17,7 @@ from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers from transformers import PreTrainedTokenizerFast -from benchmarks.data_utils.hasher import texts_to_hashes +from benchmarks.data_generator.hasher import texts_to_hashes @pytest.fixture(scope="module") diff --git a/benchmarks/tests/test_sampler.py b/benchmarks/tests/test_sampler.py index dcbbc5ac63..47b3f9c099 100644 --- a/benchmarks/tests/test_sampler.py +++ b/benchmarks/tests/test_sampler.py @@ -17,7 +17,7 @@ import numpy as np -from benchmarks.data_utils.sampler import EmpiricalSampler +from benchmarks.data_generator.sampler import EmpiricalSampler def test_empirical_sampler_distribution(): diff --git a/benchmarks/tests/test_synthesizer.py b/benchmarks/tests/test_synthesizer.py index 3e76f51802..eb8e665ae1 100644 --- a/benchmarks/tests/test_synthesizer.py +++ b/benchmarks/tests/test_synthesizer.py @@ -19,7 +19,7 @@ import tempfile import unittest -from benchmarks.data_utils.synthesizer import Synthesizer +from benchmarks.data_generator.synthesizer import Synthesizer # Helper function to create and dump data From fa88dcb38c7f165fb8f98f96db1b97426944a3d0 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 17 May 2025 20:19:02 -0700 Subject: [PATCH 24/50] separate requirements for benchmarks --- container/deps/requirements.benchmarks.txt | 17 +++++++++++++++++ container/deps/requirements.txt | 2 -- 2 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 container/deps/requirements.benchmarks.txt diff --git a/container/deps/requirements.benchmarks.txt b/container/deps/requirements.benchmarks.txt new file mode 100644 index 0000000000..1565b1f5cd --- /dev/null +++ b/container/deps/requirements.benchmarks.txt @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +networkx +pandas diff --git a/container/deps/requirements.txt b/container/deps/requirements.txt index 6e3d13490d..2546f65f85 100644 --- a/container/deps/requirements.txt +++ b/container/deps/requirements.txt @@ -21,11 +21,9 @@ httpx kubernetes==32.0.1 msgspec mypy -networkx numpy opentelemetry-api opentelemetry-sdk -pandas pip==25.0.1 pre-commit protobuf==5.27.3 From 8577a76a765357dd9342dd44085ec95e7d6fd43a Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 17 May 2025 20:24:14 -0700 Subject: [PATCH 25/50] link to mooncake trace + explanation --- benchmarks/data_generator/README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmarks/data_generator/README.md b/benchmarks/data_generator/README.md index f94ebdf004..ee215ca10e 100644 --- a/benchmarks/data_generator/README.md +++ b/benchmarks/data_generator/README.md @@ -29,6 +29,16 @@ python -m benchmarks.data_utils.prefix_analyzer --input-file - This directory is currently used for generating synthetic data based on the mooncake dataset, but should be easily extendible to any request datasets with (prefix) hash ids, with a current caveat. The synthesizer is designed to work for jsonl files in the "mooncake" trace file format, meaning that the input are increasing integers of block hashes. For now, new block hashes must be the next consecutive integer, otherwise will not work. -If you want to generate these increasing hash ids from a list of texts, you can use the `texts_to_hashes` function in `hasher.py`. 
- ### How it works The generation algorithm, simplified, is as follows From a4ba2f3bdd135ff79606ba13d7bdb6a343d69bde Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 17 May 2025 20:27:32 -0700 Subject: [PATCH 26/50] move tests into data_generator --- .../{ => data_generator}/tests/test_hasher.py | 0 .../{ => data_generator}/tests/test_sampler.py | 0 .../{ => data_generator}/tests/test_synthesizer.py | 0 benchmarks/tests/__init__.py | 14 -------------- 4 files changed, 14 deletions(-) rename benchmarks/{ => data_generator}/tests/test_hasher.py (100%) rename benchmarks/{ => data_generator}/tests/test_sampler.py (100%) rename benchmarks/{ => data_generator}/tests/test_synthesizer.py (100%) delete mode 100644 benchmarks/tests/__init__.py diff --git a/benchmarks/tests/test_hasher.py b/benchmarks/data_generator/tests/test_hasher.py similarity index 100% rename from benchmarks/tests/test_hasher.py rename to benchmarks/data_generator/tests/test_hasher.py diff --git a/benchmarks/tests/test_sampler.py b/benchmarks/data_generator/tests/test_sampler.py similarity index 100% rename from benchmarks/tests/test_sampler.py rename to benchmarks/data_generator/tests/test_sampler.py diff --git a/benchmarks/tests/test_synthesizer.py b/benchmarks/data_generator/tests/test_synthesizer.py similarity index 100% rename from benchmarks/tests/test_synthesizer.py rename to benchmarks/data_generator/tests/test_synthesizer.py diff --git a/benchmarks/tests/__init__.py b/benchmarks/tests/__init__.py deleted file mode 100644 index 3159bfe656..0000000000 --- a/benchmarks/tests/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. From 0a5c1808b706bfeecadf5a65a321470f642b3d03 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 17 May 2025 20:48:41 -0700 Subject: [PATCH 27/50] example in README --- benchmarks/data_generator/README.md | 42 ++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/benchmarks/data_generator/README.md b/benchmarks/data_generator/README.md index ee215ca10e..6ef4a32608 100644 --- a/benchmarks/data_generator/README.md +++ b/benchmarks/data_generator/README.md @@ -25,8 +25,6 @@ python -m benchmarks.data_utils.prefix_analyzer --input-file - **Options:** - `--input-file`: Path to the input trace file (default: `mooncake_trace.jsonl`) - `--num-requests`: Number of requests to synthesize (default: 100000) -- `--speedup-ratio`: Factor to speed up request intervals (default: 1) +- `--speedup-ratio`: Factor to speed up request intervals. 
It effectively divides the synthetic timestamps by this value (default: 1) - `--depth-multiplier`: Multiplier for prefix lengths (default: 1.0) - `--width-multiplier`: Number of times to replicate the core radix tree (default: 1) - `--prompt-len-multiplier`: Multiplier for leaf path lengths (default: 1.0, use <1 for shorter prompts) @@ -64,11 +62,43 @@ python -m benchmarks.data_utils.synthesizer --input-file - - `--block-size`: Block size for prefilling and decoding (default: 512) - `--output-file`: Path to the output file (default: auto-generated from input file and options) ---- +### Example + +Say we only have these hash lists: + +``` +[0, 1, 2, (3)] +[0, 1] +[0, 1, 2] +[0, (4), (5)] +``` + +First, we identify the "core prefix nodes" as [0, 1, 2] since they are visited more than once. The nodes [3, 4, 5] would be considered "user prompts" as they only appear once (noted in brackets). + +If we set the `depth-multiplier` to 2, then the core prefix branches will be stretched, effectively giving: + +``` +[0, 1, 2, 3, 4, 5, 6, (7)] +[0, 1, 2, 3] +[0, 1, 2, 3, 4] +[0, 1, (8), (9)] +``` + + +Note that the "prompt branches" are not stretched by `depth-multiplier`. They can be separately modified by applying `prompt-len-multiplier`. -This directory is currently used for generating synthetic data based on the mooncake dataset, but should be easily extendible to any request datasets with (prefix) hash ids, with a current caveat. The synthesizer is designed to work for jsonl files in the "mooncake" trace file format, meaning that the input are increasing integers of block hashes. For now, new block hashes must be the next consecutive integer, otherwise will not work. +Now, if we set `width-multiplier` to 2, then each row will have a 50 percent chance of being incremented by a large integer, so that they will be effectively separated into a new radix tree, which matches the statistics of the original one, but having completely different roots. 
+ +For example, if rows 2 and 4 are offseted, then we would get: + +``` +[0, 1, 2, 3, 4, 5, 6, (7)] +[10, 11, 12, 13] +[0, 1, 2, 3, 4] +[10, 11, (14), (15)] +``` -### How it works +### Implementation details The generation algorithm, simplified, is as follows From 3ea2668d4db42b8029a18f89acd557d840daad62 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 17 May 2025 23:23:16 -0700 Subject: [PATCH 28/50] package data_generator --- .github/workflows/pre-merge-python.yml | 6 +- benchmarks/__init__.py | 14 ----- benchmarks/data_generator/graph_utils.py | 5 +- benchmarks/data_generator/prefix_analyzer.py | 2 +- benchmarks/data_generator/synthesizer.py | 9 ++- .../data_generator/tests/test_hasher.py | 3 +- .../data_generator/tests/test_sampler.py | 3 +- .../data_generator/tests/test_synthesizer.py | 2 +- benchmarks/pyproject.toml | 59 +++++++++++++++++++ container/deps/requirements.benchmarks.txt | 17 ------ 10 files changed, 72 insertions(+), 48 deletions(-) delete mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/pyproject.toml delete mode 100644 container/deps/requirements.benchmarks.txt diff --git a/.github/workflows/pre-merge-python.yml b/.github/workflows/pre-merge-python.yml index b9385852db..55488270eb 100644 --- a/.github/workflows/pre-merge-python.yml +++ b/.github/workflows/pre-merge-python.yml @@ -53,8 +53,8 @@ jobs: - name: Run pytest env: PYTEST_MARKS: "pre_merge or mypy" - run: | - docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}" + run: Run pytest for data_generator + docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pip install -e ./benchmarks && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}" ./benchmarks/data_generator/tests' - name: Copy test report from test Container if: always() run: | @@ -77,4 +77,4 @@ jobs: uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: Event File - path: ${{ github.event_path }} + path: ${{ github.event_path }} \ No newline at end of file diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py deleted file mode 100644 index 3159bfe656..0000000000 --- a/benchmarks/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/benchmarks/data_generator/graph_utils.py b/benchmarks/data_generator/graph_utils.py index 82dbdea9e1..59bb997512 100644 --- a/benchmarks/data_generator/graph_utils.py +++ b/benchmarks/data_generator/graph_utils.py @@ -15,9 +15,8 @@ import networkx as nx import numpy as np - -from benchmarks.data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT -from benchmarks.data_generator.sampler import get_cdf +from data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT +from data_generator.sampler import get_cdf def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: diff --git a/benchmarks/data_generator/prefix_analyzer.py b/benchmarks/data_generator/prefix_analyzer.py index 4c19fb5cb7..bb67a6494e 100644 --- a/benchmarks/data_generator/prefix_analyzer.py +++ b/benchmarks/data_generator/prefix_analyzer.py @@ -16,7 +16,7 @@ import json from collections import Counter -from benchmarks.data_generator.logging import calculate_and_print_statistics +from data_generator.logging import calculate_and_print_statistics class PrefixAnalyzer: diff --git a/benchmarks/data_generator/synthesizer.py b/benchmarks/data_generator/synthesizer.py index 2d114e61ce..ec62165ab8 100644 --- a/benchmarks/data_generator/synthesizer.py +++ b/benchmarks/data_generator/synthesizer.py @@ -20,15 +20,14 @@ import networkx as nx import numpy as np import pandas as pd - -from benchmarks.data_generator.graph_utils import ( +from data_generator.graph_utils import ( _merge_chains, _precompute_transition_cdfs, _remove_leaves, ) -from benchmarks.data_generator.logging import calculate_and_print_statistics -from benchmarks.data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT -from benchmarks.data_generator.sampler import EmpiricalSampler, sample_from_cdf +from data_generator.logging import calculate_and_print_statistics +from data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT +from data_generator.sampler import EmpiricalSampler, sample_from_cdf class Synthesizer: diff --git a/benchmarks/data_generator/tests/test_hasher.py b/benchmarks/data_generator/tests/test_hasher.py index e5a53cbd6e..40c7ddbf5f 100644 --- a/benchmarks/data_generator/tests/test_hasher.py +++ b/benchmarks/data_generator/tests/test_hasher.py @@ -14,11 +14,10 @@ # limitations under the License. 
import pytest +from data_generator.hasher import texts_to_hashes from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers from transformers import PreTrainedTokenizerFast -from benchmarks.data_generator.hasher import texts_to_hashes - @pytest.fixture(scope="module") def dummy_tokenizer(): diff --git a/benchmarks/data_generator/tests/test_sampler.py b/benchmarks/data_generator/tests/test_sampler.py index 47b3f9c099..f207064300 100644 --- a/benchmarks/data_generator/tests/test_sampler.py +++ b/benchmarks/data_generator/tests/test_sampler.py @@ -16,8 +16,7 @@ from collections import Counter import numpy as np - -from benchmarks.data_generator.sampler import EmpiricalSampler +from data_generator.sampler import EmpiricalSampler def test_empirical_sampler_distribution(): diff --git a/benchmarks/data_generator/tests/test_synthesizer.py b/benchmarks/data_generator/tests/test_synthesizer.py index eb8e665ae1..99c18ca9bb 100644 --- a/benchmarks/data_generator/tests/test_synthesizer.py +++ b/benchmarks/data_generator/tests/test_synthesizer.py @@ -19,7 +19,7 @@ import tempfile import unittest -from benchmarks.data_generator.synthesizer import Synthesizer +from data_generator.synthesizer import Synthesizer # Helper function to create and dump data diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml new file mode 100644 index 0000000000..7a392541d9 --- /dev/null +++ b/benchmarks/pyproject.toml @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[project] +name = "data-generator" +version = "0.1.0" +description = "Data generator library for LLM benchmarks" +readme = "README.md" +authors = [ + {name = "NVIDIA CORPORATION & AFFILIATES"} +] +license = {text = "Apache-2.0"} +requires-python = ">=3.10" + +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: Information Technology", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Operating System :: POSIX :: Linux", +] + +dependencies = [ + "networkx", + "pandas", + "transformers", +] + +[project.urls] +Repository = "https://github.com/ai-dynamo/dynamo.git" + +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +packages = ["data_generator"] + +[tool.setuptools.package-data] +data_generator = ["**/*.py"] \ No newline at end of file diff --git a/container/deps/requirements.benchmarks.txt b/container/deps/requirements.benchmarks.txt deleted file mode 100644 index 1565b1f5cd..0000000000 --- a/container/deps/requirements.benchmarks.txt +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -networkx -pandas From 4e3283467d1ba0b4f0ed672c4144e6a096f1a90c Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 17 May 2025 23:39:34 -0700 Subject: [PATCH 29/50] cli --- benchmarks/data_generator/__init__.py | 6 +++ benchmarks/data_generator/cli.py | 52 ++++++++++++++++++++ benchmarks/data_generator/logging.py | 4 +- benchmarks/data_generator/prefix_analyzer.py | 6 ++- benchmarks/data_generator/synthesizer.py | 9 +++- benchmarks/pyproject.toml | 4 ++ 6 files changed, 77 insertions(+), 4 deletions(-) create mode 100644 benchmarks/data_generator/cli.py diff --git a/benchmarks/data_generator/__init__.py b/benchmarks/data_generator/__init__.py index 3159bfe656..39f7928448 100644 --- a/benchmarks/data_generator/__init__.py +++ b/benchmarks/data_generator/__init__.py @@ -12,3 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from data_generator.cli import main as cli_main + + +def main(): + cli_main() diff --git a/benchmarks/data_generator/cli.py b/benchmarks/data_generator/cli.py new file mode 100644 index 0000000000..7e42b4c17b --- /dev/null +++ b/benchmarks/data_generator/cli.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import sys + + +def main(): + parser = argparse.ArgumentParser( + description="Data generation and analysis tools for benchmarking", + prog="datagen", + ) + + # Add subparsers for commands + subparsers = parser.add_subparsers(dest="command", help="Command to run") + + # Create the parser for the "analyze" command + subparsers.add_parser("analyze", help="Analyze data") + + # Create the parser for the "synthesize" command + subparsers.add_parser("synthesize", help="Synthesize data") + + args, remaining = parser.parse_known_args() + + if args.command == "analyze": + # Import and run the analyzer main + from data_generator import prefix_analyzer + + sys.argv = [sys.argv[0]] + remaining + prefix_analyzer.main() + elif args.command == "synthesize": + # Import and run the synthesizer main + from data_generator import synthesizer + + sys.argv = [sys.argv[0]] + remaining + synthesizer.main() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/data_generator/logging.py b/benchmarks/data_generator/logging.py index a6212d87a5..4055e97b1c 100644 --- a/benchmarks/data_generator/logging.py +++ b/benchmarks/data_generator/logging.py @@ -17,6 +17,7 @@ import numpy as np import pandas as pd +from tabulate import tabulate def calculate_and_print_statistics(metrics: Dict[str, List[float]]) -> pd.DataFrame: @@ -47,7 +48,8 @@ def calculate_and_print_statistics(metrics: Dict[str, List[float]]) -> pd.DataFr } ) + # Replace the printing code with tabulate stats_df = pd.DataFrame(stats_data, index=metric_names) - print(stats_df, "\n") + print(tabulate(stats_df, headers="keys", tablefmt="pretty", floatfmt=".2f"), "\n") return stats_df diff --git a/benchmarks/data_generator/prefix_analyzer.py b/benchmarks/data_generator/prefix_analyzer.py index bb67a6494e..bbf3a1baab 100644 --- a/benchmarks/data_generator/prefix_analyzer.py +++ b/benchmarks/data_generator/prefix_analyzer.py @@ -153,7 +153,7 @@ def _analyze_cache_hit_rates(self) -> list[float]: return cache_hit_rates -if __name__ == "__main__": +def main(): import argparse parser = argparse.ArgumentParser(description="Analyze prefix dataset statistics") @@ -181,3 +181,7 @@ def _analyze_cache_hit_rates(self) -> list[float]: # Create analyzer instance analyzer = PrefixAnalyzer(dataset_path, block_size=block_size) analyzer.analyze() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/data_generator/synthesizer.py b/benchmarks/data_generator/synthesizer.py index ec62165ab8..db63343aff 100644 --- a/benchmarks/data_generator/synthesizer.py +++ b/benchmarks/data_generator/synthesizer.py @@ -25,7 +25,6 @@ _precompute_transition_cdfs, _remove_leaves, ) -from data_generator.logging import calculate_and_print_statistics from data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT from data_generator.sampler import EmpiricalSampler, sample_from_cdf @@ -348,10 +347,12 @@ def __repr__(self) -> str: return rep -if __name__ == "__main__": +def 
main(): import argparse from pathlib import Path + from data_generator.logging import calculate_and_print_statistics + parser = argparse.ArgumentParser(description="Synthesize Mooncake-Esque dataset") parser.add_argument( "--input-file", @@ -452,3 +453,7 @@ def __repr__(self) -> str: for request in requests: f.write(json.dumps(request) + "\n") print(f"synthetic dataset saved at {Path(output_file).resolve()}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index 7a392541d9..01f8ed242c 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -42,9 +42,13 @@ classifiers = [ dependencies = [ "networkx", "pandas", + "tabulate", "transformers", ] +[project.scripts] +datagen = "data_generator.cli:main" + [project.urls] Repository = "https://github.com/ai-dynamo/dynamo.git" From 06678d8bf87be05bdf38c3f2eb8118faa4b1a2ed Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 17 May 2025 23:48:47 -0700 Subject: [PATCH 30/50] restore accidentally deleted pytest in per-merge --- .github/workflows/pre-merge-python.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pre-merge-python.yml b/.github/workflows/pre-merge-python.yml index 55488270eb..ee225893bb 100644 --- a/.github/workflows/pre-merge-python.yml +++ b/.github/workflows/pre-merge-python.yml @@ -53,7 +53,12 @@ jobs: - name: Run pytest env: PYTEST_MARKS: "pre_merge or mypy" - run: Run pytest for data_generator + run: | + docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}" + - name: Run pytest for benchmarks + env: + PYTEST_MARKS: "pre_merge or mypy" + run: | docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pip install -e ./benchmarks && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}" ./benchmarks/data_generator/tests' - name: Copy test report from test Container if: always() From 04f7c987bf42929b8aa4484951ecae97c21ba48f Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 18 May 2025 01:03:13 -0700 Subject: [PATCH 31/50] actually, need to now install data_generator before the pytests --- .github/workflows/pre-merge-python.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/pre-merge-python.yml b/.github/workflows/pre-merge-python.yml index ee225893bb..e95b91508f 100644 --- a/.github/workflows/pre-merge-python.yml +++ b/.github/workflows/pre-merge-python.yml @@ -54,12 +54,7 @@ jobs: env: PYTEST_MARKS: "pre_merge or mypy" run: | - docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}" - - name: Run pytest for benchmarks - env: - PYTEST_MARKS: "pre_merge or mypy" - run: | - docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pip install -e ./benchmarks && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}" ./benchmarks/data_generator/tests' + docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pip install -e . 
&& pip install -e ./benchmarks && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}"' - name: Copy test report from test Container if: always() run: | From 78ee708baf7b71e233cbbb377c29cc2c50b146cf Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 18 May 2025 22:05:54 -0700 Subject: [PATCH 32/50] fix pytest workflow --- .github/workflows/pre-merge-python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pre-merge-python.yml b/.github/workflows/pre-merge-python.yml index e95b91508f..113b72d516 100644 --- a/.github/workflows/pre-merge-python.yml +++ b/.github/workflows/pre-merge-python.yml @@ -54,7 +54,7 @@ jobs: env: PYTEST_MARKS: "pre_merge or mypy" run: | - docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pip install -e . && pip install -e ./benchmarks && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}"' + docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pip install -e ./benchmarks && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}"' - name: Copy test report from test Container if: always() run: | From ac4068623d249570a41584b33f483e1e73ceed1b Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 18 May 2025 22:56:54 -0700 Subject: [PATCH 33/50] pytest mypy --- benchmarks/pyproject.toml | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index 01f8ed242c..aa98d6e7d5 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -44,6 +44,7 @@ dependencies = [ "pandas", "tabulate", "transformers", + "pytest-mypy", ] [project.scripts] @@ -60,4 +61,21 @@ build-backend = "setuptools.build_meta" packages = ["data_generator"] [tool.setuptools.package-data] -data_generator = ["**/*.py"] \ No newline at end of file +data_generator = ["**/*.py"] + +[tool.mypy] +explicit_package_bases = true +ignore_missing_imports = true +check_untyped_defs = true + +[tool.pytest.ini_options] +addopts = [ + "-ra", + "--showlocals", + "--strict-markers", + "--strict-config", + "--mypy", # This flag enables mypy type checking during pytest runs + "--ignore-glob=*model.py", + "--ignore-glob=*_inc.py", + "--ignore-glob=deploy/cloud/api-store/*", +] \ No newline at end of file From 5cbac2aa6f86435e2d1b10f9ac54ad182259a481 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 18 May 2025 23:54:01 -0700 Subject: [PATCH 34/50] update README with cli --- benchmarks/data_generator/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/data_generator/README.md b/benchmarks/data_generator/README.md index 6ef4a32608..01af69689d 100644 --- a/benchmarks/data_generator/README.md +++ b/benchmarks/data_generator/README.md @@ -19,7 +19,7 @@ The Prefix Analyzer provides statistics on the original trace file, such as Inpu It is useful for understanding the structure and reuse patterns in your dataset. 
```bash -python -m benchmarks.data_utils.prefix_analyzer --input-file --block-size +datagen analyze --input-file --block-size ``` - `--input-file`: Path to your trace file in jsonl format (default: `mooncake_trace.jsonl`) @@ -48,7 +48,7 @@ This is useful for generating large, realistic synthetic traces for benchmarking ### How to run ```bash -python -m benchmarks.data_utils.synthesizer --input-file --num-requests [other options...] +datagen synthesize --input-file --num-requests [other options...] ``` **Options:** From cac7be45f47256071b7118c2e8d287802120ce65 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 18 May 2025 23:56:20 -0700 Subject: [PATCH 35/50] mypy --install-types --- .github/workflows/pre-merge-python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pre-merge-python.yml b/.github/workflows/pre-merge-python.yml index 113b72d516..9e47a404d3 100644 --- a/.github/workflows/pre-merge-python.yml +++ b/.github/workflows/pre-merge-python.yml @@ -54,7 +54,7 @@ jobs: env: PYTEST_MARKS: "pre_merge or mypy" run: | - docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pip install -e ./benchmarks && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}"' + docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pip install -e ./benchmarks && mypy --install-types && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}"' - name: Copy test report from test Container if: always() run: | From 3b3fa594ca8b3c86dd2111426364c3e737106f6b Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Mon, 19 May 2025 19:32:56 -0700 Subject: [PATCH 36/50] make pytest ignore benchmarks --- .github/workflows/pre-merge-python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pre-merge-python.yml b/.github/workflows/pre-merge-python.yml index 9e47a404d3..fec34cb63c 100644 --- a/.github/workflows/pre-merge-python.yml +++ b/.github/workflows/pre-merge-python.yml @@ -54,7 +54,7 @@ jobs: env: PYTEST_MARKS: "pre_merge or mypy" run: | - docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pip install -e ./benchmarks && mypy --install-types && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}"' + docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}" --ignore=benchmarks' - name: Copy test report from test Container if: always() run: | From 8985f5870d5e20e59b0eab68565ec940481ab40d Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Mon, 19 May 2025 19:34:48 -0700 Subject: [PATCH 37/50] remove bash -ec --- .github/workflows/pre-merge-python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pre-merge-python.yml b/.github/workflows/pre-merge-python.yml index fec34cb63c..b2cd6d2cf0 100644 --- a/.github/workflows/pre-merge-python.yml +++ b/.github/workflows/pre-merge-python.yml @@ -54,7 +54,7 @@ jobs: env: PYTEST_MARKS: "pre_merge or mypy" run: | - docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -ec 'pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}" 
--ignore=benchmarks' + docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}" --ignore=benchmarks - name: Copy test report from test Container if: always() run: | From a248cc54d0f3d3ff955e01cfa08c98392d4303c0 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Thu, 22 May 2025 23:40:19 -0700 Subject: [PATCH 38/50] short README in benchmarks --- benchmarks/README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 benchmarks/README.md diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000000..c648491722 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,30 @@ + + +# Benchmarks + +This directory contains benchmarking scripts and tools for performance evaluation of key system components including KV routing, disaggregation, and the Planner. + +## Installation + +To install the necessary dependencies locally, run: + +```bash +pip install -e . +``` + +Currently, this will install lightweight tools for: +- Analyzing structured data with desired prefix structures (`datagen analyze`) +- Synthesizing structured data for testing purposes (`datagen synthesize`) \ No newline at end of file From 76586877cd89a670e280f9670af1ad0b3e2958cf Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Thu, 22 May 2025 23:42:14 -0700 Subject: [PATCH 39/50] minor language cleanups --- benchmarks/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index c648491722..71568a23ba 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -15,7 +15,7 @@ # Benchmarks -This directory contains benchmarking scripts and tools for performance evaluation of key system components including KV routing, disaggregation, and the Planner. +This directory contains (will contain) benchmarking scripts and tools for performance evaluation of key system components including KV routing, disaggregation, and the Planner. ## Installation @@ -26,5 +26,5 @@ pip install -e . ``` Currently, this will install lightweight tools for: -- Analyzing structured data with desired prefix structures (`datagen analyze`) -- Synthesizing structured data for testing purposes (`datagen synthesize`) \ No newline at end of file +- Analyzing prefix-structured data (`datagen analyze`) +- Synthesizing structured data customizable for testing purposes (`datagen synthesize`) \ No newline at end of file From f5427097808ee27fa5e71eb766d3382b6fa6d102 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Thu, 22 May 2025 23:47:31 -0700 Subject: [PATCH 40/50] reference benchmarks in KV tuning guide --- docs/guides/kv_router_perf_tuning.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/guides/kv_router_perf_tuning.md b/docs/guides/kv_router_perf_tuning.md index 8e3806b68f..8b33ba1a8a 100644 --- a/docs/guides/kv_router_perf_tuning.md +++ b/docs/guides/kv_router_perf_tuning.md @@ -65,6 +65,8 @@ Such saturation can create a feedback loop—where the cache-rich worker continu ## Tuning Guidelines +Currently, optimal use of our KV router requires understanding your backend engine's capacity and the prefix structure of your data. We provide analysis tools for this purpose in the `benchmarks` directory. In the future, we plan to enable automatic tuning of our KV router (via `Planner`) using worker feedback metrics and dynamic analysis of data prefix structures (WIP). + ### 1. 
Consider Total KV Block Allocation Check the total number of KV blocks allocated for your backend engine. For smaller models (e.g., 8B parameters), this can exceed one million blocks. In such cases: From d627b2b30daf800ad05c9a16eb8e7ea73ca46b65 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Thu, 22 May 2025 23:49:02 -0700 Subject: [PATCH 41/50] better writing --- docs/guides/kv_router_perf_tuning.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/kv_router_perf_tuning.md b/docs/guides/kv_router_perf_tuning.md index 8b33ba1a8a..44ac19e708 100644 --- a/docs/guides/kv_router_perf_tuning.md +++ b/docs/guides/kv_router_perf_tuning.md @@ -65,7 +65,7 @@ Such saturation can create a feedback loop—where the cache-rich worker continu ## Tuning Guidelines -Currently, optimal use of our KV router requires understanding your backend engine's capacity and the prefix structure of your data. We provide analysis tools for this purpose in the `benchmarks` directory. In the future, we plan to enable automatic tuning of our KV router (via `Planner`) using worker feedback metrics and dynamic analysis of data prefix structures (WIP). +Currently, optimal use of our KV router requires understanding your backend engine's capacity and the prefix structure of your data. We provide analysis tools for this purpose in the `benchmarks` directory. In the future, we plan to enable automatic tuning of our KV router (via `Planner`) using worker feedback metrics and dynamic analysis of data prefix structures (WIP). Below are several tips we recommend following. ### 1. Consider Total KV Block Allocation From 14b54c5dbe5a94b62853facee3a9297e0ad4ef5e Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Fri, 23 May 2025 00:03:14 -0700 Subject: [PATCH 42/50] docstrings --- benchmarks/data_generator/graph_utils.py | 32 +++++++++++++++++++++--- benchmarks/data_generator/sampler.py | 7 ++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/benchmarks/data_generator/graph_utils.py b/benchmarks/data_generator/graph_utils.py index 59bb997512..3f4a2b672c 100644 --- a/benchmarks/data_generator/graph_utils.py +++ b/benchmarks/data_generator/graph_utils.py @@ -20,14 +20,24 @@ def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: - """Make the graph radix-like (meaning all unary paths are contracted). - In addition, keep track of the contracted lengths. + """ + Make the graph radix-like (meaning all unary paths are contracted). + + This function transforms a prefix tree into a radix tree structure by contracting + unary paths (chains of nodes with exactly one predecessor and one successor). + The resulting radix tree is significantly more compact than the original prefix tree, + as it eliminates redundant intermediate nodes while preserving the structural + information needed for path sampling. + + This compression is particularly beneficial for efficient path sampling during data + synthesis. In addition, keep track of the contracted lengths in the 'length' attribute + of each node to preserve the original path information. Args: G (networkx.DiGraph): A directed graph representing a prefix tree structure. Returns: - networkx.DiGraph: The modified graph with unary paths contracted. + networkx.DiGraph: The resulting radix tree with unary paths contracted. 
""" for visited in sorted(np.unique([G.nodes[node]["visited"] for node in G.nodes()])): sub_nodes = [node for node in G.nodes() if G.nodes[node]["visited"] == visited] @@ -76,6 +86,22 @@ def _merge_chains(G: nx.DiGraph) -> nx.DiGraph: def _remove_leaves(G: nx.DiGraph) -> tuple[nx.DiGraph, list[int]]: + """ + Remove all nodes that are only visited once from the tree. + + This function removes nodes representing unique user prompts (nodes with visited=1) + from the radix tree, leaving only the "core radix tree" structure that contains + commonly traversed paths. The removed nodes typically represent leaf paths that + were accessed only once and don't contribute to the core structural patterns. + + Args: + G (networkx.DiGraph): A directed graph representing a radix tree structure. + + Returns: + tuple[networkx.DiGraph, list[int]]: A tuple containing: + - The modified graph with unique nodes removed + - A list of lengths of the removed leaf nodes + """ leaves = { node: G.nodes[node]["length"] for node in G.nodes() diff --git a/benchmarks/data_generator/sampler.py b/benchmarks/data_generator/sampler.py index 803e4148e3..96839471fb 100644 --- a/benchmarks/data_generator/sampler.py +++ b/benchmarks/data_generator/sampler.py @@ -48,6 +48,13 @@ def sample_from_cdf( class EmpiricalSampler: + """ + Takes data, learns from the pure empirical distribution, and samples directly from it. + + Args: + data (Union[List[Any], np.ndarray]): The input data to learn the distribution from. + """ + def __init__(self, data: Union[List[Any], np.ndarray]) -> None: self.rng = np.random.default_rng(0) self.empty_data = len(data) == 0 From 5defd1215243f52f7b84fed341651d9d5d64a8cf Mon Sep 17 00:00:00 2001 From: Yan Ru Pei Date: Sat, 24 May 2025 03:54:46 +0800 Subject: [PATCH 43/50] Update benchmarks/README.md Co-authored-by: Neelay Shah Signed-off-by: Yan Ru Pei --- benchmarks/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 71568a23ba..34f1f8e1b9 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -15,7 +15,7 @@ # Benchmarks -This directory contains (will contain) benchmarking scripts and tools for performance evaluation of key system components including KV routing, disaggregation, and the Planner. +This directory contains benchmarking scripts and tools for performance evaluation. ## Installation From c7ac84cf475aeff1096c93d7cd2fd796cdd587d2 Mon Sep 17 00:00:00 2001 From: Yan Ru Pei Date: Sat, 24 May 2025 03:55:10 +0800 Subject: [PATCH 44/50] Update benchmarks/data_generator/README.md Co-authored-by: Neelay Shah Signed-off-by: Yan Ru Pei --- benchmarks/data_generator/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/data_generator/README.md b/benchmarks/data_generator/README.md index 01af69689d..d49e33965e 100644 --- a/benchmarks/data_generator/README.md +++ b/benchmarks/data_generator/README.md @@ -15,7 +15,7 @@ ## Prefix Analyzer -The Prefix Analyzer provides statistics on the original trace file, such as Input Sequence Length (ISL), Output Sequence Length (OSL), and theoretical cache hit rate. +The Prefix Analyzer provides statistics on a trace file, such as Input Sequence Length (ISL), Output Sequence Length (OSL), and theoretical cache hit rate. It is useful for understanding the structure and reuse patterns in your dataset. 
 
 ```bash

From c346b8f9edede9e4fee1503c33a9caa40ab74a3d Mon Sep 17 00:00:00 2001
From: PeaBrane
Date: Fri, 23 May 2025 13:10:58 -0700
Subject: [PATCH 45/50] improve readability

---
 benchmarks/data_generator/README.md      | 47 ++++++++++++++----------
 benchmarks/data_generator/synthesizer.py | 26 ++++++-------
 2 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/benchmarks/data_generator/README.md b/benchmarks/data_generator/README.md
index d49e33965e..83419180dc 100644
--- a/benchmarks/data_generator/README.md
+++ b/benchmarks/data_generator/README.md
@@ -13,6 +13,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 -->
 
+## Trace File Format
+
+The following tools help analyze and synthesize new data based on the [mooncake trace file format](https://github.com/kvcache-ai/Mooncake/blob/d21da178bae8db9651cf18a76824c084145fc725/mooncake_trace.jsonl). In this format, the first few lines would look like this, for example:
+
+```
+{"timestamp": 0, "input_length": 6755, "output_length": 500, "hash_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]}
+{"timestamp": 0, "input_length": 7319, "output_length": 490, "hash_ids": [0, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]}
+{"timestamp": 3052, "input_length": 7234, "output_length": 794, "hash_ids": [0, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41]}
+{"timestamp": 3052, "input_length": 2287, "output_length": 316, "hash_ids": [0, 42, 43, 44, 45]}
+```
+
+**Hash ID Generation:** Each new hash ID is the next consecutive integer after the last one used. To generate these increasing hash IDs from a list of texts, we provide the `texts_to_hashes` function in `hasher.py`.
+
+**Timestamp:** The arrival time (in milliseconds) of the request, which can be the same for multiple requests arriving simultaneously.
+
+**Block Size and Hash IDs:** In this example, the `block_size` (the page size of the KV cache) is assumed to be 512. The length of the `hash_ids` array equals `input_length // block_size`.
+
 ## Prefix Analyzer
 
 The Prefix Analyzer provides statistics on a trace file, such as Input Sequence Length (ISL), Output Sequence Length (OSL), and theoretical cache hit rate.
@@ -27,16 +44,6 @@ datagen analyze --input-file --block-size
 
 The script will print out summary statistics for ISL, OSL, user prompt lengths, and the theoretical cache hit rate (assuming an infinite cache).
 
-The trace file is expected to be in the [mooncake trace file format](https://github.com/kvcache-ai/Mooncake/blob/d21da178bae8db9651cf18a76824c084145fc725/mooncake_trace.jsonl). For example, the first few lines would look like this:
-
-```
-{"timestamp": 0, "input_length": 6755, "output_length": 500, "hash_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]}
-{"timestamp": 0, "input_length": 7319, "output_length": 490, "hash_ids": [0, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]}
-{"timestamp": 0, "input_length": 7234, "output_length": 794, "hash_ids": [0, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41]}
-{"timestamp": 0, "input_length": 2287, "output_length": 316, "hash_ids": [0, 42, 43, 44, 45]}
-```
-Note that each new hash id is the next consecutive integer after the last one used. To generate these increasing hash ids from a list of texts, we provide the `texts_to_hashes` function in `hasher.py`.
-
 ## Synthesizer
 
 The Synthesizer goes a step further:
@@ -55,8 +62,8 @@ datagen synthesize --input-file --num-requests [other
 - `--input-file`: Path to the input trace file (default: `mooncake_trace.jsonl`)
 - `--num-requests`: Number of requests to synthesize (default: 100000)
 - `--speedup-ratio`: Factor to speed up request intervals. It effectively divides the synthetic timestamps by this value (default: 1)
-- `--depth-multiplier`: Multiplier for prefix lengths (default: 1.0)
-- `--width-multiplier`: Number of times to replicate the core radix tree (default: 1)
+- `--prefix-len-multiplier`: Multiplier for prefix lengths (default: 1.0)
+- `--prefix-root-multiplier`: Number of times to replicate the core radix tree (default: 1)
 - `--prompt-len-multiplier`: Multiplier for leaf path lengths (default: 1.0, use <1 for shorter prompts)
 - `--max-isl`: Maximum input sequence length to include in output (default: None, no filtering)
 - `--block-size`: Block size for prefilling and decoding (default: 512)
@@ -75,26 +82,26 @@ Say we only have these hash lists:
 
 First, we identify the "core prefix nodes" as [0, 1, 2] since they are visited more than once. The nodes [3, 4, 5] would be considered "user prompts" as they only appear once (noted in brackets).
 
-If we set the `depth-multiplier` to 2, then the core prefix branches will be stretched, effectively giving:
+If we set the `prefix-len-multiplier` to 2, then the core prefix branches will be stretched, effectively giving:
 
 ```
-[0, 1, 2, 3, 4, 5, 6, (7)]
+[0, 1, 2, 3, 4, 5, (6)]
 [0, 1, 2, 3]
-[0, 1, 2, 3, 4]
-[0, 1, (8), (9)]
+[0, 1, 2, 3, 4, 5]
+[0, 1, (7), (8)]
 ```
 
-Note that the "prompt branches" are not stretched by `depth-multiplier`. They can be separately modified by applying `prompt-len-multiplier`.
+Note that the "prompt branches" are not stretched by `prefix-len-multiplier`. They can be separately modified by applying `prompt-len-multiplier`.
 
-Now, if we set `width-multiplier` to 2, then each row will have a 50 percent chance of being incremented by a large integer, so that they will be effectively separated into a new radix tree, which matches the statistics of the original one, but having completely different roots.
+Now, if we set `prefix-root-multiplier` to 2, then each row will have a 50 percent chance of being incremented by a large integer, so that they will be effectively separated into a new radix tree, which matches the statistics of the original one, but with completely different roots.
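To make these two multipliers concrete before the worked example below, here is a minimal illustrative sketch (the helper names `stretch_prefix` and `maybe_offset_root`, and the fixed offset value, are assumptions rather than the synthesizer's actual code) of stretching a row's core prefix and pushing it into a separate radix tree:

```python
# Illustrative sketch only (not the synthesizer's implementation): how a row of
# core prefix hash IDs could be stretched by a prefix-len-multiplier and moved
# into a fresh radix tree by a prefix-root-multiplier-style offset.
import random


def stretch_prefix(hash_ids, prefix_len_multiplier):
    # Each original prefix block expands into `prefix_len_multiplier` new blocks,
    # so shared prefixes stay shared but become proportionally longer.
    stretched = []
    for h in hash_ids:
        stretched.extend(range(h * prefix_len_multiplier, (h + 1) * prefix_len_multiplier))
    return stretched


def maybe_offset_root(hash_ids, num_copies, offset=1_000_000):
    # With `num_copies` trees, each row lands in one copy uniformly at random;
    # copies share no hash IDs, so they behave like independent radix trees.
    copy_index = random.randrange(num_copies)
    return [h + copy_index * offset for h in hash_ids]


row = [0, 1, 2, 3]
print(maybe_offset_root(stretch_prefix(row, 2), num_copies=2))
```

Prompt (leaf) blocks are not touched by this stretching; they are scaled separately via `prompt-len-multiplier`.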
 
 For example, if rows 2 and 4 are offset, then we would get:
 
 ```
-[0, 1, 2, 3, 4, 5, 6, (7)]
+[0, 1, 2, 3, 4, 5, (6)]
 [10, 11, 12, 13]
-[0, 1, 2, 3, 4]
+[0, 1, 2, 3, 4, 5]
 [10, 11, (14), (15)]
 ```
 
diff --git a/benchmarks/data_generator/synthesizer.py b/benchmarks/data_generator/synthesizer.py
index db63343aff..fdb3846463 100644
--- a/benchmarks/data_generator/synthesizer.py
+++ b/benchmarks/data_generator/synthesizer.py
@@ -36,7 +36,7 @@ def __init__(
         block_size: int = 512,
         num_copies: int = 1,
         speedup_ratio: float = 1.0,
-        context_len_multiplier: float = 1.0,
+        prefix_len_multiplier: float = 1.0,
         prompt_len_multiplier: float = 1.0,
     ):
         """Load the mooncake dataset and extract core statistics like
@@ -74,7 +74,7 @@ def __init__(
         self.block_size = block_size
         self.num_copies = num_copies
         self.speedup_ratio = float(speedup_ratio)
-        self.context_len_multiplier = float(context_len_multiplier)
+        self.prefix_len_multiplier = float(prefix_len_multiplier)
         self.prompt_len_multiplier = float(prompt_len_multiplier)
 
         # assert correct arg bounds
@@ -85,8 +85,8 @@ def __init__(
             isinstance(self.speedup_ratio, float) and self.speedup_ratio > 0
         ), "speedup_ratio must be a positive float"
         assert (
-            isinstance(self.context_len_multiplier, float)
-            and self.context_len_multiplier > 0
+            isinstance(self.prefix_len_multiplier, float)
+            and self.prefix_len_multiplier > 0
         ), "context_len_multiplier must be a positive float"
         assert (
             isinstance(self.prompt_len_multiplier, float)
@@ -187,8 +187,8 @@ def __init__(
 
     def _relabel_nodes(self) -> None:
         # Scale node labels by length multiplier if needed
-        if self.context_len_multiplier > 1:
-            multiplier = int(np.ceil(self.context_len_multiplier))
+        if self.prefix_len_multiplier > 1:
+            multiplier = int(np.ceil(self.prefix_len_multiplier))
 
             # Create mapping for relabeling, preserving -1 and -2
             mapping = {
@@ -200,10 +200,10 @@ def __init__(
             self.max_hash_id = multiplier * self.max_hash_id + multiplier
 
         # Shrink the lengths, but no need to relabel nodes
-        elif self.context_len_multiplier < 1:
+        elif self.prefix_len_multiplier < 1:
             for node in self.G.nodes():
                 self.G.nodes[node]["length"] = max(
-                    round(self.G.nodes[node]["length"] * self.context_len_multiplier), 1
+                    round(self.G.nodes[node]["length"] * self.prefix_len_multiplier), 1
                 )
 
     def _synthesize_leaf_path(self) -> list[int]:
@@ -373,13 +373,13 @@ def main():
         help="Factor to speed up request intervals (default: 1)",
     )
     parser.add_argument(
-        "--depth-multiplier",
+        "--prefix-len-multiplier",
         type=float,
         default=1.0,
         help="Multiplier for prefix lengths (default: 1.0)",
     )
     parser.add_argument(
-        "--width-multiplier",
+        "--prefix-root-multiplier",
        type=int,
         default=1,
         help="Number of times to replicate the core radix tree (default: 1)",
@@ -414,7 +414,7 @@ def main():
     if args.output_file is None:
         output_file = dataset_file.with_stem(
             f"{dataset_file.stem}_synth"
-            + f"_{int(args.depth_multiplier)}x{args.width_multiplier}+{args.prompt_len_multiplier}"
+            + f"_{int(args.prefix_len_multiplier)}x{args.prefix_root_multiplier}+{args.prompt_len_multiplier}"
             + f"_speedup{args.speedup_ratio}"
             + f"_maxisl{args.max_isl}"
         )
@@ -426,8 +426,8 @@ def main():
         str(dataset_file),
         block_size=args.block_size,
         speedup_ratio=args.speedup_ratio,
-        context_len_multiplier=args.depth_multiplier,
-        num_copies=args.width_multiplier,
+        prefix_len_multiplier=args.prefix_len_multiplier,
+        num_copies=args.prefix_root_multiplier,
         prompt_len_multiplier=args.prompt_len_multiplier,
     )

From e380c42e393b3ed066788ab0d42b5d8fe35382c7 Mon Sep 17 00:00:00 2001
From: PeaBrane
Date: Fri, 23 May 2025 13:14:11 -0700
Subject: [PATCH 46/50] more info on hash_ids prefix overlap

---
 benchmarks/data_generator/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/data_generator/README.md b/benchmarks/data_generator/README.md
index 83419180dc..ece3b25d13 100644
--- a/benchmarks/data_generator/README.md
+++ b/benchmarks/data_generator/README.md
@@ -24,9 +24,9 @@ The following tools help analyze and synthesize new data based on the [mooncake
 {"timestamp": 3052, "input_length": 2287, "output_length": 316, "hash_ids": [0, 42, 43, 44, 45]}
 ```
 
-**Hash ID Generation:** Each new hash ID is the next consecutive integer after the last one used. To generate these increasing hash IDs from a list of texts, we provide the `texts_to_hashes` function in `hasher.py`.
+**Hash ID Generation:** Each new hash ID is the next consecutive integer after the last one used. Two `hash_ids` lists sharing the same integers represent a prefix overlap. To generate these increasing hash IDs from a list of texts, we provide the `texts_to_hashes` function in `hasher.py`.
 
-**Timestamp:** The arrival time (in milliseconds) of the request, which can be the same for multiple requests arriving simultaneously.
+**Timestamp:** The arrival time (in milliseconds) of the request since the first request, which can be the same for multiple requests arriving simultaneously.
 
 **Block Size and Hash IDs:** In this example, the `block_size` (the page size of the KV cache) is assumed to be 512. The length of the `hash_ids` array equals `input_length // block_size`.

From 2490bc1805473e6fd3b1a0f39fa9cee148e606ce Mon Sep 17 00:00:00 2001
From: PeaBrane
Date: Mon, 26 May 2025 23:58:33 -0700
Subject: [PATCH 47/50] try adding benchmarks dir to python path

---
 .github/workflows/pre-merge-python.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pre-merge-python.yml b/.github/workflows/pre-merge-python.yml
index b2cd6d2cf0..0e73a27c51 100644
--- a/.github/workflows/pre-merge-python.yml
+++ b/.github/workflows/pre-merge-python.yml
@@ -54,7 +54,7 @@ jobs:
         env:
           PYTEST_MARKS: "pre_merge or mypy"
         run: |
-          docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m "${{ env.PYTEST_MARKS }}" --ignore=benchmarks
+          docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -c "export PYTHONPATH=/workspace/benchmarks:\$PYTHONPATH && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\""
       - name: Copy test report from test Container
         if: always()
         run: |

From bef882e56c731678ed6afd2b984df12ce03fb057 Mon Sep 17 00:00:00 2001
From: PeaBrane
Date: Tue, 27 May 2025 14:24:23 -0700
Subject: [PATCH 48/50] pip install reqs before pytest benchmarks

---
 .github/workflows/pre-merge-python.yml |  2 +-
 benchmarks/requirements.benchmarks.txt | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/requirements.benchmarks.txt

diff --git a/.github/workflows/pre-merge-python.yml b/.github/workflows/pre-merge-python.yml
index 0e73a27c51..ec99aa1451 100644
--- a/.github/workflows/pre-merge-python.yml
+++ b/.github/workflows/pre-merge-python.yml
@@ -54,7 +54,7 @@ jobs:
         env:
           PYTEST_MARKS: "pre_merge or mypy"
         run: |
-          docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -c "export PYTHONPATH=/workspace/benchmarks:\$PYTHONPATH && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\""
+          docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -c "export PYTHONPATH=/workspace/benchmarks:\$PYTHONPATH && pip install -r /workspace/benchmarks/requirements.benchmarks.txt && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\""
       - name: Copy test report from test Container
         if: always()
         run: |
diff --git a/benchmarks/requirements.benchmarks.txt b/benchmarks/requirements.benchmarks.txt
new file mode 100644
index 0000000000..435d3739d1
--- /dev/null
+++ b/benchmarks/requirements.benchmarks.txt
@@ -0,0 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+networkx
+pandas
+tabulate

From 577de3d5197c516151b46ea191065bee58c459b3 Mon Sep 17 00:00:00 2001
From: PeaBrane
Date: Tue, 27 May 2025 15:27:42 -0700
Subject: [PATCH 49/50] types-tabulate

---
 benchmarks/requirements.benchmarks.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/requirements.benchmarks.txt b/benchmarks/requirements.benchmarks.txt
index 435d3739d1..94891f40c1 100644
--- a/benchmarks/requirements.benchmarks.txt
+++ b/benchmarks/requirements.benchmarks.txt
@@ -16,3 +16,4 @@
 networkx
 pandas
 tabulate
+types-tabulate

From bbe5793887578edfc041b4d571eeb55a31afa90a Mon Sep 17 00:00:00 2001
From: PeaBrane
Date: Tue, 27 May 2025 16:09:54 -0700
Subject: [PATCH 50/50] restore pip install benchmarks

---
 .github/workflows/pre-merge-python.yml |  2 +-
 benchmarks/pyproject.toml              |  1 +
 benchmarks/requirements.benchmarks.txt | 19 -------------------
 3 files changed, 2 insertions(+), 20 deletions(-)
 delete mode 100644 benchmarks/requirements.benchmarks.txt

diff --git a/.github/workflows/pre-merge-python.yml b/.github/workflows/pre-merge-python.yml
index ec99aa1451..e5c92f66a3 100644
--- a/.github/workflows/pre-merge-python.yml
+++ b/.github/workflows/pre-merge-python.yml
@@ -54,7 +54,7 @@ jobs:
         env:
           PYTEST_MARKS: "pre_merge or mypy"
         run: |
-          docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -c "export PYTHONPATH=/workspace/benchmarks:\$PYTHONPATH && pip install -r /workspace/benchmarks/requirements.benchmarks.txt && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\""
+          docker run -w /workspace --name ${{ env.CONTAINER_ID }}_pytest ${{ steps.define_image_tag.outputs.image_tag }} bash -c "pip install -e /workspace/benchmarks && pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\""
       - name: Copy test report from test Container
         if: always()
         run: |
diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml
index aa98d6e7d5..76256cfc50 100644
--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@@ -43,6 +43,7 @@ dependencies = [
     "networkx",
     "pandas",
     "tabulate",
+    "types-tabulate",
     "transformers",
     "pytest-mypy",
 ]
diff --git a/benchmarks/requirements.benchmarks.txt b/benchmarks/requirements.benchmarks.txt
deleted file mode 100644
index 94891f40c1..0000000000
--- a/benchmarks/requirements.benchmarks.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-networkx
-pandas
-tabulate
-types-tabulate
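Stepping back from the packaging and CI changes above: the core algorithmic piece this series adds is the radix-tree contraction described in the `_merge_chains` docstring (patch 42). As a closing, hedged illustration of that idea on a toy `networkx` prefix tree (the node labels, the `contract_unary_chains` helper, and the omission of edge weights are simplifications, not the repository's implementation):

```python
# Illustrative sketch, not the repository's _merge_chains: contract unary chains
# in a toy prefix tree so it becomes radix-like, accumulating a 'length' attribute.
import networkx as nx


def contract_unary_chains(G: nx.DiGraph) -> nx.DiGraph:
    changed = True
    while changed:
        changed = False
        for node in list(G.nodes()):
            preds = list(G.predecessors(node))
            # A node that is its parent's only child gets merged into the parent.
            if len(preds) == 1 and G.out_degree(preds[0]) == 1:
                parent = preds[0]
                G.nodes[parent]["length"] += G.nodes[node]["length"]
                for child in list(G.successors(node)):
                    G.add_edge(parent, child)
                G.remove_node(node)
                changed = True
    return G


# Toy prefix tree: 0 -> 1 -> 2, which then branches into 3 and 4.
G = nx.DiGraph()
G.add_edges_from([(0, 1), (1, 2), (2, 3), (2, 4)])
nx.set_node_attributes(G, 1, "length")
contract_unary_chains(G)
print(list(G.edges()), nx.get_node_attributes(G, "length"))
```

Running it on the toy tree contracts the unary chain 0 -> 1 -> 2 into a single root node of length 3 with the two leaves attached, which is the radix-like form the synthesizer samples from.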