@@ -31,7 +31,7 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
3131set (PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" )
3232
3333# Supported NVIDIA architectures.
34- set (CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0" )
34+ set (CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0 " )
3535
3636# Supported AMD GPU architectures.
3737set (HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201" )
@@ -312,7 +312,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
312312 # Only build Marlin kernels if we are building for at least some compatible archs.
313313 # Keep building Marlin for 9.0 as there are some group sizes and shapes that
314314 # are not supported by Machete yet.
315- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS} " )
315+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0 " "${CUDA_ARCHS} " )
316316 if (MARLIN_ARCHS)
317317 set (MARLIN_SRCS
318318 "csrc/quantization/fp8/fp8_marlin.cu"
@@ -334,7 +334,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
334334
335335 # Only build AllSpark kernels if we are building for at least some compatible archs.
336336 cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS} " )
337- if (ALLSPARK_ARCHS)
337+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
338338 set (ALLSPARK_SRCS
339339 "csrc/quantization/gptq_allspark/allspark_repack.cu"
340340 "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu" )
@@ -345,46 +345,74 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
345345 message (STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS} " )
346346 else ()
347347 message (STATUS "Not building AllSpark kernels as no compatible archs found"
348- " in CUDA target architectures" )
348+ " in CUDA target architectures, or CUDA not >= 12.0 " )
349349 endif ()
350350
351+
352+ set (SCALED_MM_3X_ARCHS)
351353 # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
352- # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
353- cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS} " )
354- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS )
354+ # CUDA 12.0 or later
355+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a; " "${CUDA_ARCHS} " )
356+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS )
355357 set (SRCS
356- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x .cu"
358+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90 .cu"
357359 "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
358360 "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
359361 "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
360362 "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu" )
361363 set_gencode_flags_for_srcs(
362364 SRCS "${SRCS} "
363- CUDA_ARCHS "${SCALED_MM_3X_ARCHS } " )
365+ CUDA_ARCHS "${SCALED_MM_ARCHS } " )
364366 list (APPEND VLLM_EXT_SRC "${SRCS} " )
365- list (APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1" )
366- message (STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS} " )
367+ list (APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1" )
368+ # Let scaled_mm_c2x know it doesn't need to build these arches
369+ list (APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS} " )
370+ message (STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS} " )
367371 else ()
368- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS )
369- message (STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
372+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS )
373+ message (STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
370374 "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
371375 "later if you intend on running FP8 quantized models on "
372376 "Hopper." )
373377 else ()
374- message (STATUS "Not building scaled_mm_c3x as no compatible archs found "
378+ message (STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
375379 "in CUDA target architectures" )
376380 endif ()
381+ endif ()
377382
378- # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
379- # build any 3x kernels
380- set (SCALED_MM_3X_ARCHS)
383+ # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
384+ # CUDA 12.8 or later
385+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS} " )
386+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
387+ set (SRCS
388+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
389+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
390+ )
391+ set_gencode_flags_for_srcs(
392+ SRCS "${SRCS} "
393+ CUDA_ARCHS "${SCALED_MM_ARCHS} " )
394+ list (APPEND VLLM_EXT_SRC "${SRCS} " )
395+ list (APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1" )
396+ # Let scaled_mm_c2x know it doesn't need to build these arches
397+ list (APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS} " )
398+ message (STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS} " )
399+ else ()
400+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
401+ message (STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
402+ "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
403+ "later if you intend on running FP8 quantized models on "
404+ "Blackwell." )
405+ else ()
406+ message (STATUS "Not building scaled_mm_c3x_100 as no compatible archs found "
407+ "in CUDA target architectures" )
408+ endif ()
381409 endif ()
382410
383411 #
384412 # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
385413 # kernels for the remaining archs that are not already built for 3x.
386414 cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
387- "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS} " )
415+ "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0 " "${CUDA_ARCHS} " )
388416 # subtract out the archs that are already built for 3x
389417 list (REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS} )
390418 if (SCALED_MM_2X_ARCHS)
@@ -409,17 +437,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
409437 # 2:4 Sparse Kernels
410438
411439 # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
412- # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now ).
413- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS )
440+ # require CUDA 12.2 or later (and only work on Hopper and Blackwell ).
441+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS )
414442 set (SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu" )
415443 set_gencode_flags_for_srcs(
416444 SRCS "${SRCS} "
417- CUDA_ARCHS "${SCALED_MM_3X_ARCHS } " )
445+ CUDA_ARCHS "${SCALED_MM_ARCHS } " )
418446 list (APPEND VLLM_EXT_SRC "${SRCS} " )
419447 list (APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1" )
420- message (STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS } " )
448+ message (STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS } " )
421449 else ()
422- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS )
450+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS )
423451 message (STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
424452 "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
425453 "if you intend on running FP8 sparse quantized models on Hopper." )
@@ -434,8 +462,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
434462 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
435463 set (SRCS
436464 "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
437- "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
438- )
465+ "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" )
439466 set_gencode_flags_for_srcs(
440467 SRCS "${SRCS} "
441468 CUDA_ARCHS "${FP4_ARCHS} " )
@@ -534,6 +561,7 @@ define_gpu_extension_target(
534561 COMPILE_FLAGS ${VLLM_GPU_FLAGS}
535562 ARCHITECTURES ${VLLM_GPU_ARCHES}
536563 INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
564+ INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
537565 USE_SABI 3
538566 WITH_SOABI)
539567
@@ -557,7 +585,7 @@ set_gencode_flags_for_srcs(
557585 CUDA_ARCHS "${CUDA_ARCHS} " )
558586
559587if (VLLM_GPU_LANG STREQUAL "CUDA" )
560- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS} " )
588+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0 " "${CUDA_ARCHS} " )
561589 if (MARLIN_MOE_ARCHS)
562590 set (MARLIN_MOE_SRC
563591 "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
0 commit comments