
Commit 1f59718

feat: generate random texts from hashes using lorem ipsum (#1458)
Signed-off-by: Yan Ru Pei <yanrpei@gmail.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent eec345a commit 1f59718

File tree

8 files changed: +229 −13 lines changed

benchmarks/README.md

Lines changed: 5 additions & 2 deletions
@@ -19,12 +19,15 @@ This directory contains benchmarking scripts and tools for performance evaluation
 
 ## Installation
 
-To install the necessary dependencies locally, run:
+This is already included as part of the dynamo vllm image. To install locally or standalone, run:
 
 ```bash
 pip install -e .
 ```
 
 Currently, this will install lightweight tools for:
 - Analyzing prefix-structured data (`datagen analyze`)
-- Synthesizing structured data customizable for testing purposes (`datagen synthesize`)
+- Synthesizing structured data customizable for testing purposes (`datagen synthesize`)
+
+Detailed information is provided in the `data_generator` directory.
+
+The benchmarking scripts for the core dynamo components (e.g. routing, disagg, Planner) are coming soon.
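For orientation, the two tools named above are exposed through the `datagen` entry point once installed. Their flags are not shown in this diff, so the sketch below only invokes the built-in help, assuming the standard argparse `--help` behavior:

```bash
# Sketch only: assumes `pip install -e .` has put the `datagen` entry point on PATH.
datagen analyze --help
datagen synthesize --help
```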

benchmarks/data_generator/README.md

Lines changed: 8 additions & 0 deletions
@@ -13,6 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License. -->
 
+## Quickstart
+
+`example.py` contains an example workflow that walks through synthesizing new requests based on the mooncake trace file. It touches on the core components of this directory.
+
 ## Trace File Format
 
 The following tools help analyze and synthesize new data based on the [mooncake trace file format](https://github.com/kvcache-ai/Mooncake/blob/d21da178bae8db9651cf18a76824c084145fc725/mooncake_trace.jsonl). In this format, the first few lines would look like this, for example:
@@ -26,10 +30,14 @@ The following tools help analyze and synthesize new data based on the [mooncake
 
 **Hash ID Generation:** Each new hash ID is the next consecutive integer after the last one used. Two `hash_ids` lists sharing the same leading integers represent a prefix overlap. To generate these increasing hash IDs from a list of texts, we provide the `texts_to_hashes` function in `hasher.py`.
 
+> [!NOTE]
+> The `hashes_to_texts` function can then be used to generate random texts back from these hash IDs by sampling from Lorem Ipsum.
+
 **Timestamp:** The arrival time (in milliseconds) of the request since the first request, which can be the same for multiple requests arriving simultaneously.
 
 **Block Size and Hash IDs:** In this example, the `block_size` (the page size of the KV cache) is assumed to be 512. The length of the `hash_ids` array equals `input_length // block_size`.
 
+A general workflow converts texts to hashes with `texts_to_hashes`, synthesizes new hashes with `datagen synthesize`, and converts those back to random texts with `hashes_to_texts`.
+
 ## Prefix Analyzer
 
 The Prefix Analyzer provides statistics on a trace file, such as Input Sequence Length (ISL), Output Sequence Length (OSL), and theoretical cache hit rate.
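The hashing round trip described in the README above (`texts_to_hashes`, then `datagen synthesize`, then `hashes_to_texts`) can be sketched as follows; the tokenizer name and the toy texts are placeholders chosen for illustration, not values taken from this commit:

```python
from data_generator.hasher import hashes_to_texts, texts_to_hashes

# Placeholder tokenizer; any HuggingFace model name or a loaded tokenizer object works.
tokenizer = "deepseek-ai/deepseek-coder-1.3b-base"

# The second text extends the first, so their leading blocks (and hash IDs) coincide.
texts = [
    "shared system prompt " * 200,
    "shared system prompt " * 200 + "plus a user question",
]

hash_ids_list = texts_to_hashes(tokenizer, texts, block_size=512)
# Expect the two lists to share their leading ID(s), reflecting the common prefix.
print(hash_ids_list)

# Rebuild random Lorem Ipsum texts whose token counts match the hash structure;
# the input_lengths here are chosen to exactly fill the blocks.
input_lengths = [len(ids) * 512 for ids in hash_ids_list]
new_texts = hashes_to_texts(tokenizer, hash_ids_list, input_lengths, block_size=512)
```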
benchmarks/data_generator/example.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import tempfile
+
+import requests
+from data_generator.hasher import hashes_to_texts
+from data_generator.synthesizer import Synthesizer
+
+# download the mooncake trace file
+mooncake_trace_permalink = "https://raw.githubusercontent.com/kvcache-ai/Mooncake/f09c501b2a5d73e4d60cdeb612d7d0d54e1ec228/mooncake_trace.jsonl"
+with tempfile.NamedTemporaryFile(delete=False, suffix=".jsonl", mode="w+b") as tmp_file:
+    response = requests.get(mooncake_trace_permalink)
+    tmp_file.write(response.content)
+    trace_file = tmp_file.name
+
+
+# create the synthesizer
+synthesizer = Synthesizer(
+    dataset_file=trace_file,
+    block_size=512,  # it has to be 512, as determined by the mooncake trace
+    speedup_ratio=2,  # the requests will be sent twice as fast
+    prefix_root_multiplier=4,  # will generate 4 separate prefix roots
+    prefix_len_multiplier=4,  # prefix lengths 4 times as long
+    prompt_len_multiplier=0.5,  # shorten prompt lengths to make the prefix ratio even larger
+)
+
+# generate requests
+requests_synth = synthesizer.synthesize_requests(
+    num_requests=100,
+    input_len_filter=(
+        16384 - 1000
+    ),  # this is what most models default to, leaving some room for outputs
+)
+
+# convert the hashes into random texts (lorem ipsum), respecting the prefix structure
+tokenizer = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+input_texts = hashes_to_texts(
+    tokenizer=tokenizer,
+    hash_ids_list=[req["hash_ids"] for req in requests_synth],
+    input_lengths=[req["input_length"] for req in requests_synth],
+    block_size=512,
+)
+
+for i, req in enumerate(requests_synth):
+    req["input_text"] = input_texts[i]
+    del req["hash_ids"]
+
+output_file = "synthesized_requests.jsonl"
+with open(output_file, "w") as f:
+    for req in requests_synth:
+        f.write(json.dumps(req) + "\n")
+
+print(f"Saved {len(requests_synth)} requests to {output_file}")

benchmarks/data_generator/hasher.py

Lines changed: 97 additions & 4 deletions
@@ -13,27 +13,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict, List
+import re
+from typing import Dict, List, Union, cast
 
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+import numpy as np
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+lorem_text = (
+    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
+    "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis "
+    "nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. "
+    "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore "
+    "eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt "
+    "in culpa qui officia deserunt mollit anim id est laborum."
+)
+words = np.array(list(set(re.findall(r"\b[a-zA-Z]+\b", lorem_text))))
 
 
 def texts_to_hashes(
-    tokenizer: PreTrainedTokenizerBase, texts: List[str], block_size: int = 512
+    tokenizer: Union[str, PreTrainedTokenizerBase],
+    texts: List[str],
+    block_size: int = 512,
 ) -> List[List[int]]:
     """
     Tokenizes a list of strings (without special tokens), splits tokens into blocks,
     computes rolling hashes, and returns a list of lists of integer-mapped rolling hashes
     for each input string.
 
     Args:
-        tokenizer: Tokenizer object with a .encode method.
+        tokenizer: Tokenizer object with a .encode method or string name to load from HuggingFace.
         texts (List[str]): List of input strings.
         block_size (int): Size of each token block for hashing.
 
     Returns:
         List[List[int]]: List of lists of integer-mapped rolling hashes for each block of each input string.
     """
+    # Load tokenizer if string is provided
+    if isinstance(tokenizer, str):
+        tokenizer = cast(
+            PreTrainedTokenizerBase, AutoTokenizer.from_pretrained(tokenizer)
+        )
+
     # Batch tokenize for efficiency
     batch_encoding = tokenizer(
         texts,
@@ -71,3 +91,76 @@ def texts_to_hashes
         results.append(hashes)
 
     return results
+
+
+def hashes_to_texts(
+    tokenizer: Union[str, PreTrainedTokenizerBase],
+    hash_ids_list: List[List[int]],
+    input_lengths: List[int],
+    block_size: int = 512,
+) -> List[str]:
+    """
+    Converts a list of hash ID sequences back to text strings using a global token mapping.
+
+    Args:
+        tokenizer: Tokenizer object with a .decode method or string name to load from HuggingFace.
+        hash_ids_list (List[List[int]]): List of hash ID sequences for each input.
+        input_lengths (List[int]): Target input lengths for each sequence.
+        block_size (int): Size of each token block for reconstruction.
+
+    Returns:
+        List[str]: List of reconstructed text strings.
+    """
+    # Load tokenizer if string is provided
+    if isinstance(tokenizer, str):
+        tokenizer = cast(
+            PreTrainedTokenizerBase, AutoTokenizer.from_pretrained(tokenizer)
+        )
+
+    results: List[str] = []
+    _hash_id_to_tokens: Dict[int, np.ndarray] = {}
+
+    for hash_ids, input_len in zip(hash_ids_list, input_lengths):
+        # Verify constraint: len(hash_ids) * block_size >= input_len
+        if len(hash_ids) * block_size < input_len:
+            raise ValueError(
+                f"Constraint violation: len(hash_ids) * block_size ({len(hash_ids) * block_size}) < input_len ({input_len})"
+            )
+
+        token_arrays: List[np.ndarray] = []
+
+        for i, hash_id in enumerate(hash_ids):
+            # Determine the block size for this hash_id
+            remaining_tokens = input_len - sum(len(arr) for arr in token_arrays)
+            current_block_size = min(block_size, remaining_tokens)
+
+            if current_block_size <= 0:
+                break
+
+            # Check if hash_id already exists in global dict
+            if hash_id in _hash_id_to_tokens:
+                # Use existing array, but assert it matches current_block_size
+                existing_array = _hash_id_to_tokens[hash_id]
+                assert (
+                    len(existing_array) == current_block_size
+                ), f"Existing array length {len(existing_array)} does not match current block size {current_block_size}"
+                token_array = existing_array
+            else:
+                # Generate new random array by sampling words, tokenizing, and taking first tokens
+                sampled_words = np.random.choice(words, size=current_block_size)
+                sampled_text = " ".join(sampled_words)
+                tokens = tokenizer.encode(sampled_text, add_special_tokens=False)
+                token_array = np.array(tokens[:current_block_size], dtype=np.int32)
+                if getattr(tokenizer, "bos_token_id", None) is not None:
+                    token_array[0] = tokenizer.bos_token_id
+                _hash_id_to_tokens[hash_id] = token_array
+
+            token_arrays.append(token_array)
+
+        all_tokens = np.concatenate(token_arrays)
+
+        # Decode to text
+        text = tokenizer.decode(all_tokens, skip_special_tokens=False)
+        results.append(text)
+
+    return results
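The `_hash_id_to_tokens` cache above is what makes shared prefixes literal: a hash ID seen before reuses its cached token block. A small illustrative sketch (the tokenizer name and the hash values are placeholders, not part of this commit):

```python
from data_generator.hasher import hashes_to_texts

# Blocks 0 and 1 are shared; IDs 2 and 3 are unique to each request.
hash_ids_list = [[0, 1, 2], [0, 1, 3]]
input_lengths = [3 * 512, 3 * 512]

texts = hashes_to_texts(
    tokenizer="deepseek-ai/deepseek-coder-1.3b-base",  # placeholder tokenizer name
    hash_ids_list=hash_ids_list,
    input_lengths=input_lengths,
    block_size=512,
)

# Because the cache reuses the token blocks for IDs 0 and 1, both texts should
# start with the same Lorem Ipsum material and diverge only in their final block.
```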
benchmarks/data_generator/logging.py → benchmarks/data_generator/logging_utils.py

File renamed without changes.

benchmarks/data_generator/prefix_analyzer.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 import json
 from collections import Counter
 
-from data_generator.logging import calculate_and_print_statistics
+from data_generator.logging_utils import calculate_and_print_statistics
 
 
 class PrefixAnalyzer:

benchmarks/data_generator/synthesizer.py

Lines changed: 4 additions & 4 deletions
@@ -36,8 +36,8 @@ def __init__(
         self,
         dataset_file: str,
         block_size: int = 512,
-        num_copies: int = 1,
         speedup_ratio: float = 1.0,
+        prefix_root_multiplier: int = 1,
         prefix_len_multiplier: float = 1.0,
         prompt_len_multiplier: float = 1.0,
     ):
@@ -74,7 +74,7 @@ def __init__(
         as the hash ids will be relabeled.
         """
         self.block_size = block_size
-        self.num_copies = num_copies
+        self.num_copies = prefix_root_multiplier
         self.speedup_ratio = float(speedup_ratio)
         self.prefix_len_multiplier = float(prefix_len_multiplier)
         self.prompt_len_multiplier = float(prompt_len_multiplier)
@@ -334,7 +334,7 @@ def main():
     import argparse
     from pathlib import Path
 
-    from data_generator.logging import calculate_and_print_statistics
+    from data_generator.logging_utils import calculate_and_print_statistics
 
     parser = argparse.ArgumentParser(description="Synthesize Mooncake-Esque dataset")
     parser.add_argument(
@@ -410,7 +410,7 @@ def main():
         block_size=args.block_size,
         speedup_ratio=args.speedup_ratio,
         prefix_len_multiplier=args.prefix_len_multiplier,
-        num_copies=args.prefix_root_multiplier,
+        prefix_root_multiplier=args.prefix_root_multiplier,
         prompt_len_multiplier=args.prompt_len_multiplier,
     )
 
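For Python callers, the synthesizer change amounts to a keyword rename (the value is still stored as `self.num_copies` internally). A minimal before/after sketch with a placeholder trace file path:

```python
from data_generator.synthesizer import Synthesizer

# Before this commit:
#   synthesizer = Synthesizer(dataset_file="trace.jsonl", num_copies=4)

# After this commit, the same behavior is requested via the renamed keyword:
synthesizer = Synthesizer(dataset_file="trace.jsonl", prefix_root_multiplier=4)
```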

benchmarks/data_generator/tests/test_hasher.py

Lines changed: 47 additions & 2 deletions
@@ -13,10 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
+import random
+
 import pytest
-from data_generator.hasher import texts_to_hashes
+from data_generator.hasher import hashes_to_texts, texts_to_hashes
 from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers
-from transformers import PreTrainedTokenizerFast
+from transformers import AutoTokenizer, PreTrainedTokenizerFast
 
 
 @pytest.fixture(scope="module")
@@ -42,6 +45,11 @@ def dummy_tokenizer():
     )
 
 
+@pytest.fixture(scope="module")
+def deepseek_tokenizer():
+    return AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base")
+
+
 def test_texts_to_hashes_blocks(dummy_tokenizer):
     dum1 = "a b c d"
     dum2 = "e f g h"
@@ -52,3 +60,40 @@
 
     result = texts_to_hashes(dummy_tokenizer, texts, block_size=4)
     assert result == expected, f"Expected {expected}, got {result}"
+
+
+def test_hashes_to_texts_with_deepseek(deepseek_tokenizer):
+    """Test hashes_to_texts with the deepseek tokenizer using globally increasing hash IDs."""
+    # Test parameters
+    block_size = 64
+    num_entries = 100
+
+    # Generate test data
+    hash_ids_list = []
+    input_lengths = []
+    global_hash_id = 0
+
+    for _ in range(num_entries):
+        # Random input length between 1x and 20x block_size
+        input_length = random.randint(block_size, 20 * block_size)
+        input_lengths.append(input_length)
+
+        # Calculate number of hash_ids needed (ceil div)
+        num_hash_ids = math.ceil(input_length / block_size)
+        hash_ids = list(range(global_hash_id, global_hash_id + num_hash_ids))
+        hash_ids_list.append(hash_ids)
+
+        global_hash_id += num_hash_ids
+
+    # Call hashes_to_texts
+    texts = hashes_to_texts(
+        deepseek_tokenizer, hash_ids_list, input_lengths, block_size
+    )
+
+    # Retokenize and verify input lengths are preserved
+    for i, (text, expected_length) in enumerate(zip(texts, input_lengths)):
+        tokens = deepseek_tokenizer(text, add_special_tokens=False)["input_ids"]
+        actual_length = len(tokens)
+        assert (
+            actual_length == expected_length
+        ), f"Entry {i}: expected length {expected_length}, got {actual_length}"
