benchmarks/benchmark_dataset.py (2 additions, 1 deletion)
@@ -349,8 +349,9 @@ def sample(
             # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
             # To avoid uncontrolled change of the prompt length,
             # the encoded sequence is truncated before being decode again.
+            total_input_len = prefix_len + int(input_lens[i])
             re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
-                : input_lens[i]
+                :total_input_len
             ]
             prompt = tokenizer.decode(re_encoded_sequence)
             total_input_len = len(re_encoded_sequence)
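Why this reordering matters: decoding token ids to text and re-encoding that text does not round-trip exactly (adjacent tokens can merge), and the prompt carries prefix tokens as well as the sampled ones, so slicing the re-encoded sequence to input_lens[i] alone left the prompt shorter than the intended prefix-plus-input length. A minimal sketch of the corrected flow, assuming a GPT-2-style Hugging Face tokenizer; the prefix text is an illustrative stand-in:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

prefix_ids = tokenizer.encode("The quick brown fox")  # stand-in shared prefix
sampled_ids = [1650, 939, 486]                        # -> ['Ġcall', 'sh', 'ere']
prompt = tokenizer.decode(prefix_ids + sampled_ids)

# Truncate to the *total* target length (prefix_len + input_lens[i]),
# not the sampled length alone, then record the actual post-re-encode length.
total_input_len = len(prefix_ids) + len(sampled_ids)
re_encoded = tokenizer.encode(prompt, add_special_tokens=False)[:total_input_len]
prompt = tokenizer.decode(re_encoded)
total_input_len = len(re_encoded)  # may be shorter if tokens merged on re-encode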
examples/offline_inference/spec_decode.py (13 additions, 7 deletions)
@@ -39,16 +39,20 @@ def parse_args():
     parser.add_argument("--top-k", type=int, default=-1)
     parser.add_argument("--print-output", action="store_true")
     parser.add_argument("--output-len", type=int, default=256)
+    parser.add_argument("--model-dir", type=str, default=None)
+    parser.add_argument("--eagle-dir", type=str, default=None)
+    parser.add_argument("--max-model-len", type=int, default=2048)
     return parser.parse_args()


 def main():
     args = parse_args()
     args.endpoint_type = "openai-chat"

-    model_dir = "meta-llama/Llama-3.1-8B-Instruct"
+    model_dir = args.model_dir
+    if args.model_dir is None:
+        model_dir = "meta-llama/Llama-3.1-8B-Instruct"
     tokenizer = AutoTokenizer.from_pretrained(model_dir)
-    max_model_len = 2048

     prompts = get_samples(args, tokenizer)
     # add_special_tokens is False to avoid adding bos twice when using chat templates
@@ -57,24 +61,26 @@ def main():
     ]

     if args.method == "eagle" or args.method == "eagle3":
-        if args.method == "eagle":
+        eagle_dir = args.eagle_dir
+        if args.method == "eagle" and eagle_dir is None:
             eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
-        elif args.method == "eagle3":
+
+        elif args.method == "eagle3" and eagle_dir is None:
             eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
         speculative_config = {
             "method": args.method,
             "model": eagle_dir,
             "num_speculative_tokens": args.num_spec_tokens,
             "draft_tensor_parallel_size": args.draft_tp,
-            "max_model_len": max_model_len,
+            "max_model_len": args.max_model_len,
         }
     elif args.method == "ngram":
         speculative_config = {
             "method": "ngram",
             "num_speculative_tokens": args.num_spec_tokens,
             "prompt_lookup_max": args.prompt_lookup_max,
             "prompt_lookup_min": args.prompt_lookup_min,
-            "max_model_len": max_model_len,
+            "max_model_len": args.max_model_len,
         }
     else:
         raise ValueError(f"unknown method: {args.method}")
@@ -86,7 +92,7 @@ def main():
         enable_chunked_prefill=args.enable_chunked_prefill,
         max_num_batched_tokens=args.max_num_batched_tokens,
         enforce_eager=args.enforce_eager,
-        max_model_len=max_model_len,
+        max_model_len=args.max_model_len,
         max_num_seqs=args.max_num_seqs,
         gpu_memory_utilization=0.8,
         speculative_config=speculative_config,
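Taken together, the new flags let the target model, the draft model, and the context length be overridden from the command line instead of being hard-coded. A hypothetical invocation, spelling out the defaults shown in this diff (the --method flag is assumed from the existing parser, not added here):

python examples/offline_inference/spec_decode.py \
    --method eagle \
    --model-dir meta-llama/Llama-3.1-8B-Instruct \
    --eagle-dir yuhuili/EAGLE-LLaMA3.1-Instruct-8B \
    --max-model-len 2048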
vllm/benchmarks/datasets.py (7 additions, 3 deletions)
@@ -320,6 +320,8 @@ def __init__(
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
+        random.seed(self.random_seed)
+        np.random.seed(self.random_seed)

     def sample(
         self,
@@ -376,10 +378,11 @@ def sample(
             # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
             # To avoid uncontrolled change of the prompt length,
             # the encoded sequence is truncated before being decode again.
+            total_input_len = prefix_len + int(input_lens[i])
             re_encoded_sequence = tokenizer.encode(
-                prompt, add_special_tokens=False)[:input_lens[i]]
+                prompt, add_special_tokens=False)[:total_input_len]
             prompt = tokenizer.decode(re_encoded_sequence)
-            total_input_len = prefix_len + int(input_lens[i])
+            total_input_len = len(re_encoded_sequence)
             requests.append(
                 SampleRequest(
                     prompt=prompt,
@@ -692,7 +695,8 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 dataset_path=args.dataset_path).
             sample(tokenizer=tokenizer, num_requests=args.num_prompts),
         "random":
-        lambda: RandomDataset(dataset_path=args.dataset_path).sample(
+        lambda: RandomDataset(random_seed=args.seed,
+                              dataset_path=args.dataset_path).sample(
             tokenizer=tokenizer,
             num_requests=args.num_prompts,
             prefix_len=args.random_prefix_len,
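Seeding both RNG sources in __init__ is what makes the random dataset reproducible: the sampling path draws from the stdlib random module and from NumPy's global generator, and seeding only one of them would still leave run-to-run variation. A minimal self-contained sketch of the pattern (a stand-in class, not vLLM's actual RandomDataset):

import random

import numpy as np

class SeededSampler:
    def __init__(self, random_seed: int = 0) -> None:
        self.random_seed = random_seed
        # Seed both generators, mirroring the change above.
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)

    def sample(self, n: int) -> list[int]:
        # Draws from both RNGs, as the benchmark sampling path does.
        return ([random.randint(0, 100) for _ in range(n)]
                + [int(x) for x in np.random.randint(0, 100, size=n)])

# Two instances built with the same seed produce identical samples.
assert SeededSampler(42).sample(3) == SeededSampler(42).sample(3)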
vllm/benchmarks/serve.py (6 additions, 0 deletions)
@@ -631,6 +631,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="The label (prefix) of the benchmark results. If not specified, "
         "the endpoint type will be used as the label.",
     )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
     parser.add_argument(
         "--base-url",
         type=str,
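The new --backend option validates its value against the registry of async request functions, so the accepted choices track whatever backends are registered rather than a hand-maintained list. A minimal sketch of that pattern (the registry contents here are hypothetical stand-ins, not vLLM's actual ASYNC_REQUEST_FUNCS):

import argparse

# Hypothetical stand-in registry; the real one maps backend names to
# async request functions.
ASYNC_REQUEST_FUNCS = {"vllm": None, "openai": None, "openai-chat": None}

parser = argparse.ArgumentParser()
parser.add_argument(
    "--backend",
    type=str,
    default="vllm",
    choices=list(ASYNC_REQUEST_FUNCS.keys()),
)
args = parser.parse_args(["--backend", "openai-chat"])
print(args.backend)  # openai-chat; an unregistered name exits with an argparse error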