Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions tests/kernels/quantization/test_block_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,16 @@

# Test configurations
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
NUM_TOKENS = [7, 83, 2048]
NUM_TOKENS = [7, 2050]
D = [512, 4096, 5120, 13824]
GROUP_SIZE = [64, 128, 256, 512]
M = [1, 7, 8, 83, 84, 512, 2048, 4096]
N = [128, 512, 1024, 4096, 7168, 7748, 13824]
K = [256, 4096, 5120, 3884, 13824, 16384]
GROUP_SIZE = [64, 128, 512]
M = [1, 7, 8, 83, 84, 4096]
N = [128, 512, 7168, 7748, 13824]
K = [256, 3884, 4096, 13824, 16384]
# DeepSeek-V3's intermediate size is 18432, so N is 18432*2/8=4608 at TP8,
# and its hidden size is 7168.
M_moe = [1, 2, 7, 83, 128, 512, 2048]
M_moe_dg = [128, 192, 512, 1335, 2048]
M_moe = [1, 2, 7, 83, 128, 2048]
M_moe_dg = [128, 192, 1335, 2048]
N_moe = [128, 256, 1024, 4608] # [13824]
K_moe = [256, 512, 7168] # [13824]
BLOCK_SIZE = [[128, 128]]
Expand Down
4 changes: 2 additions & 2 deletions tests/kernels/quantization/test_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ def get_gguf_MoE_tensors(
return GGUFReader(sample_file).tensors


DTYPES = [torch.half, torch.bfloat16, torch.float32]
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
# Hidden sizes for testing; these must match the sample files in the HF repo,
# which currently provides `hidden_size = 256` and `1024` for tests.
HIDDEN_SIZES = [256, 1024]
NUM_TOKENS = [7, 83, 128, 2048] # Arbitrary values for testing
NUM_TOKENS = [7, 2050] # Arbitrary values for testing
SEEDS = [0]
QUANT_TYPES = [
# i-matrix
Expand Down
24 changes: 8 additions & 16 deletions tests/kernels/quantization/test_triton_scaled_mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@

device = "cuda"

triton_scaled_mm_module = importlib.import_module(
"vllm.model_executor.layers.quantization.compressed_tensors."
"triton_scaled_mm")
triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm

def scaled_mm_torch(a: torch.Tensor,

def torch_scaled_mm(a: torch.Tensor,
b: torch.Tensor,
scale_a: torch.Tensor,
scale_b: torch.Tensor,
Expand Down Expand Up @@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
if use_bias:
bias = torch.rand((N, ), device=device, dtype=out_dtype)

triton_scaled_mm_module = importlib.import_module(
"vllm.model_executor.layers.quantization.compressed_tensors."
"triton_scaled_mm")
triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm

c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

a_cpu = a.cpu()
b_cpu = b.cpu()
scale_a_cpu = scale_a.cpu()
scale_b_cpu = scale_b.cpu()
bias_cpu = None if bias is None else bias.cpu()

c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu,
out_dtype, bias_cpu)
c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

c_check_cpu = c_check.cpu()
torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1)
torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1)
Loading