From 3133c75b60fd9ee7353458a76507ab145db2a5fe Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 8 Feb 2025 15:51:23 -0500 Subject: [PATCH 1/3] refactor punica ops tests Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_punica_ops.py | 658 ++++++++++++++++++++++++ tests/lora/test_punica_ops_sizes.py | 401 --------------- tests/lora/test_punica_ops_variation.py | 317 ------------ tests/lora/utils.py | 41 +- 4 files changed, 692 insertions(+), 725 deletions(-) create mode 100644 tests/lora/test_punica_ops.py delete mode 100644 tests/lora/test_punica_ops_sizes.py delete mode 100644 tests/lora/test_punica_ops_variation.py diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py new file mode 100644 index 0000000000000..4a3132cbcbdc4 --- /dev/null +++ b/tests/lora/test_punica_ops.py @@ -0,0 +1,658 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This script is mainly used to tests various hidden_sizes. We have collected the +hidden_sizes included in the LoRA models currently supported by vLLM. It tests +whether the corresponding Triton kernel can run normally when tensor parallelism +is set to [1, 2, 4, 8, 16, 32, 64]. +""" +from threading import Lock +from typing import List + +import pytest +import torch + +import vllm.lora.ops.triton_ops # noqa: F401 +from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.platforms import current_platform + +from .utils import (PunicaTensors, assert_close, generate_data, + generate_data_for_expand_nslices, + generate_data_for_nslices) + + +# Utility shrink and expand operations used as reference implementations. +def sgmv_shrink_for_nslices( + nslices: int, inputs_tensor: torch.Tensor, + lora_weights_lst: List[torch.Tensor], out_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, + prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int, + num_tokens: int, scaling: float): + """ + Wrapper around sgmv_shrink that handles any nslices. + """ + for index in range(nslices): + sgmv_shrink( + inputs_tensor, + lora_weights_lst[index], + out_tensor[index], + b_seq_start_loc, + seq_len_tensor, + prompt_lora_mapping, + batches, + max_seq_length, + num_tokens, + scaling, + ) + + +def sgmv_expand_for_nslices(nslices: int, hidden_size: int, + inputs_tensor: torch.Tensor, + lora_weights_lst: List[torch.Tensor], + out_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + prompt_lora_mapping: torch.Tensor, batches: int, + max_seq_length: int, num_tokens: int, + add_inputs: bool) -> None: + """ + Wrapper around sgmv_expand that handles any nslices. 
+ """ + if nslices == 1: + # Verify the torch's sgmv_expand op + sgmv_expand( + inputs_tensor[0], + lora_weights_lst[0], + out_tensor, + b_seq_start_loc, + seq_len_tensor, + prompt_lora_mapping, + batches, + max_seq_length, + num_tokens, + add_inputs=add_inputs, + ) + else: + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + sgmv_expand_slice( + inputs_tensor[index], + lora_weights, + out_tensor, + b_seq_start_loc, + seq_len_tensor, + prompt_lora_mapping, + batches, + max_seq_length, + num_tokens, + slice_offset, + hidden_size, + add_inputs=add_inputs, + ) + slice_offset += hidden_size + + +_dict_lock = Lock() + + +def check_sgmv_shrink(batches: int, num_loras: int, rank: int, + hidden_size: int, nslices: int, dtype: torch.dtype, + device: str, seq_length: int, scaling: float): + """ + Compare outputs of vllm.sgmv_shrink kernel against a reference + implementation. + """ + data: PunicaTensors = generate_data_for_nslices( + batches, + hidden_size, + num_loras, + rank, + seq_length, + nslices, + dtype, + "shrink", + device, + ) + max_seq_length, token_nums = data.meta() + + # Preventing cache error pointer. + with _dict_lock: + _LORA_A_PTR_DICT.clear() + torch.ops.vllm.sgmv_shrink( + data.inputs_tensor, + data.lora_weights, + data.our_out_tensor, + data.b_seq_start_loc, + data.seq_len_tensor, + data.prompt_lora_mapping, + batches, + max_seq_length, + token_nums, + scaling, + ) + + sgmv_shrink_for_nslices( + nslices, + data.inputs_tensor, + data.lora_weights, + data.ref_out_tensor, + data.b_seq_start_loc, + data.seq_len_tensor, + data.prompt_lora_mapping, + batches, + max_seq_length, + token_nums, + scaling, + ) + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +def check_sgmv_expand(batches: int, num_loras: int, rank: int, + hidden_size: int, nslices: int, dtype: torch.dtype, + device: str, seq_length: int, add_inputs: bool): + """ + Compare outputs of vllm.sgmv_expand kernel against a reference + implementation. + """ + data: PunicaTensors = generate_data_for_nslices( + batches, + hidden_size, + num_loras, + rank, + seq_length, + nslices, + dtype, + "expand", + device, + ) + + max_seq_length, token_nums = data.meta() + + with _dict_lock: + _LORA_B_PTR_DICT.clear() + torch.ops.vllm.sgmv_expand( + data.inputs_tensor, + data.lora_weights, + data.our_out_tensor, + data.b_seq_start_loc, + data.seq_len_tensor, + data.prompt_lora_mapping, + batches, + max_seq_length, + token_nums, + offset_start=0, + add_inputs=add_inputs, + ) + + sgmv_expand_for_nslices(nslices, + hidden_size, + data.inputs_tensor, + data.lora_weights, + data.ref_out_tensor, + data.b_seq_start_loc, + data.seq_len_tensor, + data.prompt_lora_mapping, + batches, + max_seq_length, + token_nums, + add_inputs=add_inputs) + + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +def check_bgmv_shrink(batches: int, num_loras: int, rank: int, + hidden_size: int, dtype: torch.dtype, device: str, + scaling: float): + """ + Compare vllm.bgmv_shrink against a reference implementation. 
+ """ + seq_length = 1 + data: PunicaTensors = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + "shrink", + device, + ) + + torch.ops.vllm.bgmv_shrink( + data.inputs_tensor, + data.lora_weights, + data.our_out_tensor, + data.token_lora_mapping, + scaling, + ) + + bgmv_shrink( + data.inputs_tensor, + data.lora_weights, + data.ref_out_tensor, + data.token_lora_mapping, + scaling, + ) + + data.ref_out_tensor = data.ref_out_tensor.to(torch.float32) + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +def check_bgmv_expand(batches: int, num_loras: int, rank: int, + hidden_size: int, dtype: torch.dtype, device: str, + add_inputs: bool): + """ + Compare vllm.bgmv_expand against a reference implementation. + """ + seq_length = 1 + data: PunicaTensors = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + "expand", + device, + ) + + torch.ops.vllm.bgmv_expand( + data.inputs_tensor, + data.lora_weights, + data.our_out_tensor, + data.token_lora_mapping, + add_inputs=add_inputs, + ) + bgmv_expand( + data.inputs_tensor, + data.lora_weights, + data.ref_out_tensor, + data.token_lora_mapping, + add_inputs=add_inputs, + ) + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int, + hidden_size: int, nslices: int, dtype: torch.dtype, + device: str, add_inputs: bool): + """ + Compare vllm.bgmv_expand_slice against a reference implementation. + """ + seq_length = 1 + data: PunicaTensors = generate_data_for_expand_nslices( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + nslices, + device, + ) + + slice_offset = 0 + for index in range(nslices): + torch.ops.vllm.bgmv_expand_slice( + data.inputs_tensor, + data.lora_weights[index], + data.our_out_tensor, + data.token_lora_mapping, + slice_offset, + slice_size=hidden_size, + add_inputs=add_inputs, + ) + bgmv_expand_slice( + data.inputs_tensor, + data.lora_weights[index], + data.ref_out_tensor, + data.token_lora_mapping, + slice_offset, + slice_size=hidden_size, + add_inputs=add_inputs, + ) + + slice_offset += hidden_size + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +# Tests +# We test the punica kernels along 2 verticals mainly. +# 1. Variations in hidden_dim size +# 2. Variations in all other parameters like (batch_size, max_rank, num_loras +# etc.) + +# We have collected the hidden_sizes included in the LoRA models +# currently supported by vLLM. It tests whether the corresponding Triton +# kernel can run normally when tensor parallelism is set to +# [1, 2, 4, 8, 16, 32, 64]. 
+HIDDEN_SIZES = [ + 128, + 256, + 512, + 896, + 1024, + 1152, + 1216, + 1280, + 1536, + 1664, + 2048, + 2240, + 2304, + 2368, + 2432, + 2560, + 2752, + 3072, + 3328, + 3456, + 3584, + 3712, + 4096, + 4480, + 4608, + 4736, + 4864, + 5120, + 5504, + 5632, + 5888, + 6144, + 6400, + 6848, + 6912, + 7168, + 7424, + 8192, + 8960, + 9216, + 9472, + 10240, + 11008, + 11264, + 13824, + 14336, + 14784, + 14848, + 15360, + 18944, + 22016, + 22528, + 24576, + 27392, + 27648, + 29568, + 29696, + 32000, + 32256, + 32512, + 32768, + 33024, + 36864, + 43264, + 49152, + 49408, + 60544, + 60672, + 64000, + 64256, + 102400, + 102656, + 128000, + 128256, +] +#The size of TP +divisibility = [1, 2, 8, 16, 64] + +all_hidden_size = [] +for div in divisibility: + for hidden_size in HIDDEN_SIZES: + all_hidden_size.append(hidden_size // div) + +HIDDEN_SIZES = list(set(all_hidden_size)) + +# Test params that focuses on hidden_size variation. +hs_test_params = { + "hidden_sizes": HIDDEN_SIZES, + "batches": [4], + "num_loras": [4], + "max_ranks": [32], +} + +# General tests params that tests for variations in all dimensions +# except hidden_size. +test_params = { + "hidden_sizes": [2049], + "batches": [1, 4, 16, 17, 32, 55], + "num_loras": [1, 3, 8, 32, 55, 128], + "max_ranks": [1, 4, 8, 16, 32, 64, 128, 256], +} + +DTYPES = [torch.float16, torch.bfloat16] +DEVICES = [f"cuda:{0}"] +SEED = [0] + + +@pytest.mark.parametrize("batches", test_params['batches']) +@pytest.mark.parametrize("num_loras", test_params['num_loras']) +@pytest.mark.parametrize("rank", test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes']) +@pytest.mark.parametrize("nslices", [1, 2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +def test_punica_sgmv( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + nslices: int, + dtype: torch.dtype, + device: str, + seed: int, + op_type: str, +): + torch.set_default_device(device) + current_platform.seed_everything(seed) + + if op_type == "shrink": + check_sgmv_shrink(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + scaling=0.5) + else: + check_sgmv_expand(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + add_inputs=True) + + +@pytest.mark.parametrize("batches", hs_test_params['batches']) +@pytest.mark.parametrize("num_loras", hs_test_params['num_loras']) +@pytest.mark.parametrize("rank", hs_test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes']) +@pytest.mark.parametrize("nslices", [1, 2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +def test_punica_sgmv_hidden_size( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + nslices: int, + dtype: torch.dtype, + device: str, + seed: int, + op_type: str, +): + torch.set_default_device(device) + current_platform.seed_everything(seed) + + if op_type == "shrink": + check_sgmv_shrink(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + scaling=0.5) + else: + 
check_sgmv_expand(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + add_inputs=True) + + +@pytest.mark.parametrize("batches", test_params['batches']) +@pytest.mark.parametrize("num_loras", test_params['num_loras']) +@pytest.mark.parametrize("rank", test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes']) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +def test_punica_bgmv( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + dtype: torch.dtype, + device: str, + seed: int, + op_type: str, +): + torch.set_default_device(device) + current_platform.seed_everything(seed) + + if op_type == "shrink": + check_bgmv_shrink(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + dtype=dtype, + device=device, + scaling=0.5) + else: + check_bgmv_expand(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + dtype=dtype, + device=device, + add_inputs=True) + + +@pytest.mark.parametrize("batches", hs_test_params['batches']) +@pytest.mark.parametrize("num_loras", hs_test_params['num_loras']) +@pytest.mark.parametrize("rank", hs_test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes']) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +def test_punica_bgmv_hidden_size( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + dtype: torch.dtype, + device: str, + seed: int, + op_type: str, +): + torch.set_default_device(device) + current_platform.seed_everything(seed) + + if op_type == "shrink": + check_bgmv_shrink(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + dtype=dtype, + device=device, + scaling=0.5) + else: + check_bgmv_expand(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + dtype=dtype, + device=device, + add_inputs=True) + + +@pytest.mark.parametrize("batches", test_params['batches']) +@pytest.mark.parametrize("num_loras", test_params['num_loras']) +@pytest.mark.parametrize("rank", test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes']) +@pytest.mark.parametrize("nslices", [2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +def test_punica_bgmv_expand_nslices(batches: int, num_loras: int, rank: int, + hidden_size: int, nslices: int, + dtype: torch.dtype, device: str, + seed: int): + + torch.set_default_device(device) + current_platform.seed_everything(seed) + + check_bgmv_expand_slice(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + add_inputs=True) + + +@pytest.mark.parametrize("batches", hs_test_params['batches']) +@pytest.mark.parametrize("num_loras", hs_test_params['num_loras']) +@pytest.mark.parametrize("rank", hs_test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes']) +@pytest.mark.parametrize("nslices", [2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +def 
test_punica_bgmv_expand_nslices_hidden_size(batches: int, num_loras: int, + rank: int, hidden_size: int, + nslices: int, + dtype: torch.dtype, + device: str, seed: int): + + torch.set_default_device(device) + current_platform.seed_everything(seed) + + check_bgmv_expand_slice(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + add_inputs=True) diff --git a/tests/lora/test_punica_ops_sizes.py b/tests/lora/test_punica_ops_sizes.py deleted file mode 100644 index ecd3bc4978f39..0000000000000 --- a/tests/lora/test_punica_ops_sizes.py +++ /dev/null @@ -1,401 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -This script is mainly used to tests various hidden_sizes. We have collected the -hidden_sizes included in the LoRA models currently supported by vLLM. It tests -whether the corresponding Triton kernel can run normally when tensor parallelism -is set to [1, 2, 4, 8, 16, 32, 64]. -""" -from threading import Lock - -import pytest -import torch - -import vllm.lora.ops.triton_ops # noqa: F401 -from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, - bgmv_shrink, sgmv_expand, - sgmv_expand_slice, sgmv_shrink) -from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT -from vllm.platforms import current_platform - -from .utils import (assert_close, generate_data, - generate_data_for_expand_nslices, - generate_data_for_nslices) - -HIDDEN_SIZES = [ - 128, - 256, - 512, - 896, - 1024, - 1152, - 1216, - 1280, - 1536, - 1664, - 2048, - 2240, - 2304, - 2368, - 2432, - 2560, - 2752, - 3072, - 3328, - 3456, - 3584, - 3712, - 4096, - 4480, - 4608, - 4736, - 4864, - 5120, - 5504, - 5632, - 5888, - 6144, - 6400, - 6848, - 6912, - 7168, - 7424, - 8192, - 8960, - 9216, - 9472, - 10240, - 11008, - 11264, - 13824, - 14336, - 14784, - 14848, - 15360, - 18944, - 22016, - 22528, - 24576, - 27392, - 27648, - 29568, - 29696, - 32000, - 32256, - 32512, - 32768, - 33024, - 36864, - 43264, - 49152, - 49408, - 60544, - 60672, - 64000, - 64256, - 102400, - 102656, - 128000, - 128256, -] -#The size of TP -divisibility = [1, 2, 8, 16, 64] - -all_hidden_size = [] -for div in divisibility: - for hidden_size in HIDDEN_SIZES: - all_hidden_size.append(hidden_size // div) - -HIDDEN_SIZES = list(set(all_hidden_size)) - -BATCHES = [4] -NUM_LORA = [4] -DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [32] -SCALES = [0.5] -SEED = [0] -DEVICES = [f"cuda:{0}"] - -_dict_lock = Lock() - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("nslices", [1, 2, 3]) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["shrink", "expand"]) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_sgmv( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - scaling: float, - nslices: int, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 128 - ( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data_for_nslices( - batches, - hidden_size, - num_loras, - rank, - seq_length, - nslices, - dtype, - op_type, - device, - 
) - max_seq_length = seq_len_tensor.max() - token_nums = seq_len_tensor.sum().item() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - if op_type == "shrink": - # Preventing cache error pointer. - with _dict_lock: - _LORA_A_PTR_DICT.clear() - torch.ops.vllm.sgmv_shrink( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) - for index in range(nslices): - sgmv_shrink( - inputs_tensor, - lora_weights_lst[index], - ref_out_tensor[index], - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) - - else: - with _dict_lock: - _LORA_B_PTR_DICT.clear() - torch.ops.vllm.sgmv_expand( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - offset_start=0, - add_inputs=True, - ) - if nslices == 1: - # Verify the torch's sgmv_expand op - sgmv_expand( - inputs_tensor[0], - lora_weights_lst[0], - ref_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - add_inputs=True, - ) - else: - slice_offset = 0 - for index in range(nslices): - lora_weights = lora_weights_lst[index] - sgmv_expand_slice( - inputs_tensor[index], - lora_weights, - ref_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - slice_offset, - hidden_size, - add_inputs=True, - ) - slice_offset += hidden_size - - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["shrink", "expand"]) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_bgmv( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 1 - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data( - batches, - hidden_size, - num_loras, - rank, - seq_length, - dtype, - op_type, - device, - ) - if op_type == "shrink": - torch.ops.vllm.bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) - - bgmv_shrink( - inputs_tensor, - lora_weights, - ref_out_tensor, - indices, - scaling, - ) - - else: - torch.ops.vllm.bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) - bgmv_expand( - inputs_tensor, - lora_weights, - ref_out_tensor, - indices, - add_inputs=True, - ) - - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("nslices", [2, 3]) -@pytest.mark.parametrize("dtype", DTYPES) 
-@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_bgmv_expand_nslices( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - nslices: int, - dtype: torch.dtype, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 1 - ( - inputs_tensor, - lora_weights_lst, - our_outputs, - ref_outputs, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data_for_expand_nslices( - batches, - hidden_size, - num_loras, - rank, - seq_length, - dtype, - nslices, - device, - ) - slice_offset = 0 - for index in range(nslices): - lora_weights = lora_weights_lst[index] - torch.ops.vllm.bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - bgmv_expand_slice( - inputs_tensor, - lora_weights, - ref_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - - slice_offset += hidden_size - assert_close(our_outputs, ref_outputs) diff --git a/tests/lora/test_punica_ops_variation.py b/tests/lora/test_punica_ops_variation.py deleted file mode 100644 index 6d1d3c9430f38..0000000000000 --- a/tests/lora/test_punica_ops_variation.py +++ /dev/null @@ -1,317 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -This script is mainly used to test whether trtion kernels can run normally -under different conditions, including various batches, numbers of LoRA , and -maximum ranks. -""" -from threading import Lock - -import pytest -import torch - -# Enable custom op register -import vllm.lora.ops.triton_ops # noqa: F401 -from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, - bgmv_shrink, sgmv_expand, - sgmv_expand_slice, sgmv_shrink) -from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT -from vllm.platforms import current_platform - -from .utils import (assert_close, generate_data, - generate_data_for_expand_nslices, - generate_data_for_nslices) - -HIDDEN_SIZES = [2049] - -BATCHES = [1, 4, 16, 32] -NUM_LORA = [1, 8, 32, 128] -DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256] -SCALES = [0.5] -SEED = [0] -DEVICES = [f"cuda:{0}"] - -_dict_lock = Lock() - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("nslices", [1, 2, 3]) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["shrink", "expand"]) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_sgmv( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - scaling: float, - nslices: int, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 128 - ( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data_for_nslices( - batches, - hidden_size, - num_loras, - rank, - seq_length, - nslices, - dtype, - op_type, - device, - ) - max_seq_length = seq_len_tensor.max() - token_nums = seq_len_tensor.sum().item() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length 
= max_seq_length.item() - if op_type == "shrink": - # Preventing cache error pointer. - with _dict_lock: - _LORA_A_PTR_DICT.clear() - torch.ops.vllm.sgmv_shrink( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) - for index in range(nslices): - sgmv_shrink( - inputs_tensor, - lora_weights_lst[index], - ref_out_tensor[index], - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) - - else: - with _dict_lock: - _LORA_B_PTR_DICT.clear() - torch.ops.vllm.sgmv_expand( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - offset_start=0, - add_inputs=True, - ) - slice_offset = 0 - if nslices == 1: - # Verify the torch's sgmv_expand op - sgmv_expand( - inputs_tensor[0], - lora_weights_lst[0], - ref_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - add_inputs=True, - ) - else: - for index in range(nslices): - lora_weights = lora_weights_lst[index] - sgmv_expand_slice( - inputs_tensor[index], - lora_weights, - ref_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - slice_offset, - hidden_size, - add_inputs=True, - ) - slice_offset += hidden_size - - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["shrink", "expand"]) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_bgmv( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 1 - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data( - batches, - hidden_size, - num_loras, - rank, - seq_length, - dtype, - op_type, - device, - ) - if op_type == "shrink": - torch.ops.vllm.bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) - - bgmv_shrink( - inputs_tensor, - lora_weights, - ref_out_tensor, - indices, - scaling, - ) - - else: - torch.ops.vllm.bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) - bgmv_expand( - inputs_tensor, - lora_weights, - ref_out_tensor, - indices, - add_inputs=True, - ) - - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("nslices", [2, 3]) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_bgmv_expand_nslices( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - 
nslices: int, - dtype: torch.dtype, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 1 - ( - inputs_tensor, - lora_weights_lst, - our_outputs, - ref_outputs, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data_for_expand_nslices( - batches, - hidden_size, - num_loras, - rank, - seq_length, - dtype, - nslices, - device, - ) - slice_offset = 0 - for index in range(nslices): - lora_weights = lora_weights_lst[index] - torch.ops.vllm.bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - bgmv_expand_slice( - inputs_tensor, - lora_weights, - ref_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - - slice_offset += hidden_size - assert_close(our_outputs, ref_outputs) diff --git a/tests/lora/utils.py b/tests/lora/utils.py index bda00e08190ef..1e163fbf97ce3 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union import torch @@ -106,6 +107,31 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) +@dataclass +class PunicaTensors: + inputs_tensor: torch.Tensor + lora_weights: Union[torch.Tensor, List[torch.Tensor]] + our_out_tensor: torch.Tensor + ref_out_tensor: torch.Tensor + b_seq_start_loc: torch.Tensor + prompt_lora_mapping: torch.Tensor + seq_len_tensor: torch.Tensor + token_lora_mapping: torch.Tensor + + def meta(self) -> Tuple[int, int]: + """ + Infer max_seq_length and token_nums from the tensors + and return them. 
+ """ + max_seq_length = self.seq_len_tensor.max() + token_nums = self.seq_len_tensor.sum().item() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + return max_seq_length, token_nums + + def generate_data( batches, hidden_size, @@ -115,7 +141,7 @@ def generate_data( dtype, op_type, device, -): +) -> PunicaTensors: seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches, )).to(device) b_seq_start_loc = torch.cumsum( @@ -164,7 +190,8 @@ def generate_data( indices[current_offset:current_offset + seq_len_tensor[b_id]].copy_(lora_index) current_offset += seq_len_tensor[b_id].item() - return ( + + return PunicaTensors( inputs_tensor, lora_weights, our_out_tensor, @@ -185,7 +212,7 @@ def generate_data_for_expand_nslices( dtype, nslices, device, -): +) -> PunicaTensors: seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches, )).to(device) b_seq_start_loc = torch.cumsum( @@ -222,7 +249,7 @@ def generate_data_for_expand_nslices( current_offset += seq_len_tensor[b_id].item() lora_indices_tensor = lora_indices_tensor.to(device) - return ( + return PunicaTensors( inputs_tensor, lora_weights_lst, our_out_tensor, @@ -244,7 +271,7 @@ def generate_data_for_nslices( dtype, op_type, device, -): +) -> PunicaTensors: seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches, )).to(device) b_seq_start_loc = torch.cumsum( @@ -302,7 +329,7 @@ def generate_data_for_nslices( current_offset += seq_len_tensor[b_id].item() lora_indices_tensor = lora_indices_tensor.to(device) - return ( + return PunicaTensors( inputs_tensor, lora_weights_lst, our_out_tensor, From cfb41eeec44e86ed971278d962c907e52f6b81a3 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 10 Feb 2025 09:09:46 -0500 Subject: [PATCH 2/3] remove unused comment Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_punica_ops.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 4a3132cbcbdc4..bcc21028b6a70 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -1,10 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -""" -This script is mainly used to tests various hidden_sizes. We have collected the -hidden_sizes included in the LoRA models currently supported by vLLM. It tests -whether the corresponding Triton kernel can run normally when tensor parallelism -is set to [1, 2, 4, 8, 16, 32, 64]. -""" from threading import Lock from typing import List From 0d019e49b351e79e10fa4c778024c3b111e92add Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 10 Feb 2025 10:42:54 -0500 Subject: [PATCH 3/3] revert additional tests Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_punica_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index bcc21028b6a70..032e20470bcd3 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -421,8 +421,8 @@ def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int, # except hidden_size. test_params = { "hidden_sizes": [2049], - "batches": [1, 4, 16, 17, 32, 55], - "num_loras": [1, 3, 8, 32, 55, 128], + "batches": [1, 4, 16, 32], + "num_loras": [1, 8, 32, 128], "max_ranks": [1, 4, 8, 16, 32, 64, 128, 256], }
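
Note on selecting the former per-file sweeps from the consolidated file: the
hidden-size sweep that used to live in tests/lora/test_punica_ops_sizes.py is
now carried by the *_hidden_size tests (driven by hs_test_params), while the
general parameter sweep from tests/lora/test_punica_ops_variation.py is carried
by the remaining tests (driven by test_params). A minimal sketch of reproducing
that split with pytest's -k name filter, assuming the test names stay as
introduced in this patch:

    # Minimal sketch (assumes the test names from this patch): the
    # *_hidden_size tests carry the hidden-size sweep, the rest carry the
    # general parameter sweep.
    import pytest

    # Hidden-size sweep (formerly tests/lora/test_punica_ops_sizes.py)
    pytest.main(["tests/lora/test_punica_ops.py", "-k", "hidden_size"])

    # General parameter sweep (formerly tests/lora/test_punica_ops_variation.py)
    pytest.main(["tests/lora/test_punica_ops.py", "-k", "not hidden_size"])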