@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
3434# Supported Python versions. These versions are searched in order, and the
3535# first match is selected. These should be kept in sync with setup.py.
3636#
37- set (PYTHON_SUPPORTED_VERSIONS "3.9" "3. 10" "3.11" "3.12" "3.13" )
37+ set (PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13" )
3838
3939# Supported AMD GPU architectures.
4040set (HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" )
@@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
269269 "csrc/sampler.cu"
270270 "csrc/cuda_view.cu"
271271 "csrc/quantization/gptq/q_gemm.cu"
272- "csrc/quantization/compressed_tensors/int8_quant_kernels .cu"
273- "csrc/quantization/fp8/common.cu"
272+ "csrc/quantization/w8a8/int8/scaled_quant .cu"
273+ "csrc/quantization/w8a8/ fp8/common.cu"
274274 "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
275275 "csrc/quantization/gguf/gguf_kernel.cu"
276276 "csrc/quantization/activation_kernels.cu"
@@ -314,12 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
314314 list (APPEND VLLM_EXT_SRC
315315 "csrc/quantization/awq/gemm_kernels.cu"
316316 "csrc/permute_cols.cu"
317- "csrc/quantization/cutlass_w8a8 /scaled_mm_entry.cu"
317+ "csrc/quantization/w8a8/cutlass /scaled_mm_entry.cu"
318318 "csrc/quantization/fp4/nvfp4_quant_entry.cu"
319319 "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
320320 "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
321321 "csrc/cutlass_extensions/common.cpp"
322- "csrc/quantization/fp8/per_token_group_quant.cu" )
322+ "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
323+ "csrc/quantization/w8a8/int8/per_token_group_quant.cu" )
323324
324325 set_gencode_flags_for_srcs(
325326 SRCS "${VLLM_EXT_SRC} "
@@ -423,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
423424 cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS} " )
424425 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
425426 set (SRCS
426- "csrc/quantization/cutlass_w8a8 /scaled_mm_c3x_sm90.cu"
427- "csrc/quantization/cutlass_w8a8 /c3x/scaled_mm_sm90_fp8.cu"
428- "csrc/quantization/cutlass_w8a8 /c3x/scaled_mm_sm90_int8.cu"
429- "csrc/quantization/cutlass_w8a8 /c3x/scaled_mm_azp_sm90_int8.cu"
430- "csrc/quantization/cutlass_w8a8 /c3x/scaled_mm_blockwise_sm90_fp8.cu" )
427+ "csrc/quantization/w8a8/cutlass /scaled_mm_c3x_sm90.cu"
428+ "csrc/quantization/w8a8/cutlass /c3x/scaled_mm_sm90_fp8.cu"
429+ "csrc/quantization/w8a8/cutlass /c3x/scaled_mm_sm90_int8.cu"
430+ "csrc/quantization/w8a8/cutlass /c3x/scaled_mm_azp_sm90_int8.cu"
431+ "csrc/quantization/w8a8/cutlass /c3x/scaled_mm_blockwise_sm90_fp8.cu" )
431432 set_gencode_flags_for_srcs(
432433 SRCS "${SRCS} "
433434 CUDA_ARCHS "${SCALED_MM_ARCHS} " )
@@ -458,9 +459,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
458459 endif ()
459460 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
460461 set (SRCS
461- "csrc/quantization/cutlass_w8a8 /scaled_mm_c3x_sm120.cu"
462- "csrc/quantization/cutlass_w8a8 /c3x/scaled_mm_sm120_fp8.cu"
463- "csrc/quantization/cutlass_w8a8 /c3x/scaled_mm_blockwise_sm120_fp8.cu"
462+ "csrc/quantization/w8a8/cutlass /scaled_mm_c3x_sm120.cu"
463+ "csrc/quantization/w8a8/cutlass /c3x/scaled_mm_sm120_fp8.cu"
464+ "csrc/quantization/w8a8/cutlass /c3x/scaled_mm_blockwise_sm120_fp8.cu"
464465 )
465466 set_gencode_flags_for_srcs(
466467 SRCS "${SRCS} "
@@ -492,9 +493,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
492493 endif ()
493494 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
494495 set (SRCS
495- "csrc/quantization/cutlass_w8a8 /scaled_mm_c3x_sm100.cu"
496- "csrc/quantization/cutlass_w8a8 /c3x/scaled_mm_sm100_fp8.cu"
497- "csrc/quantization/cutlass_w8a8 /c3x/scaled_mm_blockwise_sm100_fp8.cu"
496+ "csrc/quantization/w8a8/cutlass /scaled_mm_c3x_sm100.cu"
497+ "csrc/quantization/w8a8/cutlass /c3x/scaled_mm_sm100_fp8.cu"
498+ "csrc/quantization/w8a8/cutlass /c3x/scaled_mm_blockwise_sm100_fp8.cu"
498499 )
499500 set_gencode_flags_for_srcs(
500501 SRCS "${SRCS} "
@@ -525,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
525526 # Subtract out the archs that are already built for 3x (SCALED_MM_3X_ARCHS).
526527 list (REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS} )
527528 if (SCALED_MM_2X_ARCHS)
528- set (SRCS "csrc/quantization/cutlass_w8a8 /scaled_mm_c2x.cu" )
529+ set (SRCS "csrc/quantization/w8a8/cutlass /scaled_mm_c2x.cu" )
529530 set_gencode_flags_for_srcs(
530531 SRCS "${SRCS} "
531532 CUDA_ARCHS "${SCALED_MM_2X_ARCHS} " )
@@ -648,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
648649 # if it's possible to compile MoE kernels that use its output.
649650 cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS} " )
650651 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
651- set (SRCS "csrc/quantization/cutlass_w8a8 /moe/grouped_mm_c3x_sm90.cu" )
652+ set (SRCS "csrc/quantization/w8a8/cutlass /moe/grouped_mm_c3x_sm90.cu" )
652653 set_gencode_flags_for_srcs(
653654 SRCS "${SRCS} "
654655 CUDA_ARCHS "${SCALED_MM_ARCHS} " )
@@ -672,7 +673,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
672673 cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS} " )
673674 endif ()
674675 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
675- set (SRCS "csrc/quantization/cutlass_w8a8 /moe/grouped_mm_c3x_sm100.cu" )
676+ set (SRCS "csrc/quantization/w8a8/cutlass /moe/grouped_mm_c3x_sm100.cu" )
676677 set_gencode_flags_for_srcs(
677678 SRCS "${SRCS} "
678679 CUDA_ARCHS "${SCALED_MM_ARCHS} " )
@@ -697,7 +698,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
697698 cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS} " )
698699 endif ()
699700 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
700- set (SRCS "csrc/quantization/cutlass_w8a8 /moe/moe_data.cu" )
701+ set (SRCS "csrc/quantization/w8a8/cutlass /moe/moe_data.cu" )
701702 set_gencode_flags_for_srcs(
702703 SRCS "${SRCS} "
703704 CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS} " )
@@ -720,7 +721,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
720721 cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS} " )
721722 endif ()
722723 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
723- set (SRCS "csrc/quantization/cutlass_w8a8 /moe/blockwise_scaled_group_mm_sm100.cu" )
724+ set (SRCS "csrc/quantization/w8a8/cutlass /moe/blockwise_scaled_group_mm_sm100.cu" )
724725 set_gencode_flags_for_srcs(
725726 SRCS "${SRCS} "
726727 CUDA_ARCHS "${SCALED_MM_ARCHS} " )
0 commit comments