 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict, List
+import re
+from typing import Dict, List, Union, cast
 
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+import numpy as np
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+lorem_text = (
+    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
+    "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis "
+    "nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. "
+    "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore "
+    "eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt "
+    "in culpa qui officia deserunt mollit anim id est laborum."
+)
+words = np.array(list(set(re.findall(r"\b[a-zA-Z]+\b", lorem_text))))
 
 
 def texts_to_hashes(
-    tokenizer: PreTrainedTokenizerBase, texts: List[str], block_size: int = 512
+    tokenizer: Union[str, PreTrainedTokenizerBase],
+    texts: List[str],
+    block_size: int = 512,
 ) -> List[List[int]]:
     """
     Tokenizes a list of strings (without special tokens), splits tokens into blocks,
     computes rolling hashes, and returns a list of lists of integer-mapped rolling hashes
     for each input string.
 
     Args:
-        tokenizer: Tokenizer object with a .encode method.
+        tokenizer: Tokenizer object with a .encode method or string name to load from HuggingFace.
         texts (List[str]): List of input strings.
         block_size (int): Size of each token block for hashing.
 
     Returns:
         List[List[int]]: List of lists of integer-mapped rolling hashes for each block of each input string.
     """
+    # Load tokenizer if string is provided
+    if isinstance(tokenizer, str):
+        tokenizer = cast(
+            PreTrainedTokenizerBase, AutoTokenizer.from_pretrained(tokenizer)
+        )
+
     # Batch tokenize for efficiency
     batch_encoding = tokenizer(
         texts,
@@ -71,3 +91,76 @@ def texts_to_hashes( |
         results.append(hashes)
 
     return results
+
+
+def hashes_to_texts(
+    tokenizer: Union[str, PreTrainedTokenizerBase],
+    hash_ids_list: List[List[int]],
+    input_lengths: List[int],
+    block_size: int = 512,
+) -> List[str]:
+    """
+    Converts a list of hash ID sequences back to text strings using a global token mapping.
+
+    Args:
+        tokenizer: Tokenizer object with a .decode method or string name to load from HuggingFace.
+        hash_ids_list (List[List[int]]): List of hash ID sequences for each input.
+        input_lengths (List[int]): Target input lengths for each sequence.
+        block_size (int): Size of each token block for reconstruction.
+
+    Returns:
+        List[str]: List of reconstructed text strings.
+    """
+    # Load tokenizer if string is provided
+    if isinstance(tokenizer, str):
+        tokenizer = cast(
+            PreTrainedTokenizerBase, AutoTokenizer.from_pretrained(tokenizer)
+        )
+
+    results: List[str] = []
+    _hash_id_to_tokens: Dict[int, np.ndarray] = {}
+
+    for hash_ids, input_len in zip(hash_ids_list, input_lengths):
+        # Verify constraint: len(hash_ids) * block_size >= input_len
+        if len(hash_ids) * block_size < input_len:
+            raise ValueError(
+                f"Constraint violation: len(hash_ids) * block_size ({len(hash_ids) * block_size}) < input_len ({input_len})"
+            )
+
+        token_arrays: List[np.ndarray] = []
+
+        for hash_id in hash_ids:
+            # Determine the block size for this hash_id
+            remaining_tokens = input_len - sum(len(arr) for arr in token_arrays)
+            current_block_size = min(block_size, remaining_tokens)
+
+            if current_block_size <= 0:
+                break
+
+            # Check if hash_id already exists in global dict
+            if hash_id in _hash_id_to_tokens:
+                # Use existing array, but assert it matches current_block_size
+                existing_array = _hash_id_to_tokens[hash_id]
+                assert (
+                    len(existing_array) == current_block_size
+                ), f"Existing array length {len(existing_array)} does not match current block size {current_block_size}"
+                token_array = existing_array
+            else:
+                # Generate new random array by sampling words, tokenizing, and taking first tokens
+                sampled_words = np.random.choice(words, size=current_block_size)
+                sampled_text = " ".join(sampled_words)
+                tokens = tokenizer.encode(sampled_text, add_special_tokens=False)
+                token_array = np.array(tokens[:current_block_size], dtype=np.int32)
+                if getattr(tokenizer, "bos_token_id", None) is not None:
+                    token_array[0] = tokenizer.bos_token_id
+                _hash_id_to_tokens[hash_id] = token_array
+
+            token_arrays.append(token_array)
+
+        all_tokens = np.concatenate(token_arrays)
+
+        # Decode to text
+        text = tokenizer.decode(all_tokens, skip_special_tokens=False)
+        results.append(text)
+
+    return results
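
For reviewers, a minimal usage sketch of the hashing direction. The import path token_hashes and the gpt2 tokenizer name are illustrative assumptions, not part of this diff; any module exposing these helpers and any Hugging Face tokenizer name would do.

# Hypothetical import path; substitute the module this file actually lives in.
from token_hashes import texts_to_hashes

texts = [
    "The quick brown fox jumps over the lazy dog.",
    "The quick brown fox jumps over the lazy dog.",
]
# Passing a string now loads the tokenizer via AutoTokenizer.from_pretrained.
hash_ids = texts_to_hashes("gpt2", texts, block_size=8)
# One list of block-level hash IDs per input; identical inputs should yield
# identical ID sequences within a single call.
print(hash_ids)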
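
The reverse direction regenerates synthetic lorem-ipsum-style prompts whose block structure matches the hash IDs; equal IDs reuse the same cached token block. The input lengths below assume full blocks purely for illustration; the code only requires len(hash_ids) * block_size >= input_len for each sequence.

# Continuing the sketch above (same hypothetical import path).
from token_hashes import hashes_to_texts

input_lengths = [len(ids) * 8 for ids in hash_ids]  # full blocks at block_size=8
synthetic = hashes_to_texts("gpt2", hash_ids, input_lengths, block_size=8)
for text in synthetic:
    print(text[:80])  # words sampled from lorem_text, decoded back to text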