
Commit 923d05a

[Kernel] Fix CUTLASS 3.x custom broadcast load epilogue (vllm-project…

tlrmchlsmth authored and robertgshaw2-neuralmagic committed Jun 23, 2024
1 parent cab4a5d commit 923d05a
Showing 2 changed files with 2 additions and 4 deletions.
2 changes: 1 addition & 1 deletion
@@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast {

   CUTLASS_DEVICE void
   begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
-    if (params.ptr_row == nullptr) {
+    if (!params.row_broadcast) {
       return;
     }
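The one-line kernel fix swaps a pointer test for an explicit mode flag: in a row-or-scalar broadcast, "no row to load" is a property of the mode, not of the pointer, so the old `ptr_row == nullptr` guard could let `begin()` issue a row load in scalar mode (plausibly the illegal memory access mentioned in the Python hunk below). A minimal PyTorch sketch, illustrative only and not vllm's kernel code, of the two modes the epilogue must tell apart:

# A minimal PyTorch sketch (illustrative only, not vllm's kernel code) of the
# two broadcast modes the epilogue distinguishes. The dequant scale applied
# to a GEMM accumulator is either a single scalar (per-tensor) or a per-row
# vector (per-token); only the second involves loading a real vector from
# memory, which is what the corrected row_broadcast guard gates.
import torch

m, k, n = 4, 8, 16
acc = torch.randn(m, k) @ torch.randn(k, n)  # stand-in for the fp8 GEMM accumulator

scale_scalar = torch.tensor(0.5)  # scalar mode: no row vector exists to load
scale_row = torch.rand(m, 1)      # row mode: a genuine per-row vector in memory

out_scalar = acc * scale_scalar   # epilogue result, scalar-broadcast case
out_row = acc * scale_row         # epilogue result, row-broadcast case
print(out_scalar.shape, out_row.shape)  # both torch.Size([4, 16])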
4 changes: 1 addition & 3 deletions vllm/model_executor/layers/quantization/fp8.py
@@ -257,9 +257,7 @@ def apply(self,
         # If dynamic, layer.input_scale is None and x_scale computed from x.
         # If static, layer.input_scale is scalar and x_scale is input_scale.

-        # Temporarily disable CUTLASS kernels due to an illegal memory access
-        #if bias is None and self.cutlass_fp8_supported:
-        if False:
+        if bias is None and self.cutlass_fp8_supported:
             qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)

             # Fused GEMM_DQ
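With the kernel guard corrected, the Python hunk drops the temporary `if False:` kill switch and restores the original condition, so the fused CUTLASS GEMM+dequant path is taken again whenever there is no bias and the hardware supports fp8. A simplified, self-contained sketch of that dispatch; every `*_sim` helper is a hypothetical stand-in, not vllm's actual `ops` API:

# Simplified dispatch sketch (all *_sim helpers are hypothetical stand-ins,
# not vllm's ops API). Mirrors the restored condition: take the fused
# CUTLASS path only when bias is None and the hardware supports fp8.
import torch

def scaled_fp8_quant_sim(x, static_scale=None):
    # Dynamic: scale computed from x; static: the provided input_scale is
    # used (mirrors the comment pair at the top of the hunk).
    scale = static_scale if static_scale is not None else x.abs().max() / 448.0
    return x / scale, scale  # real code would also cast to an fp8 dtype

def cutlass_scaled_mm_sim(q, w, x_scale, w_scale):
    return (q @ w) * x_scale * w_scale  # fused GEMM + dequantization, simulated

def fallback_scaled_mm_sim(q, w, x_scale, w_scale, bias):
    out = (q @ w) * x_scale * w_scale
    return out if bias is None else out + bias

def apply_fp8_linear_sim(x, w, w_scale, input_scale=None, bias=None,
                         cutlass_fp8_supported=True):
    qinput, x_scale = scaled_fp8_quant_sim(x, input_scale)
    if bias is None and cutlass_fp8_supported:  # the re-enabled branch
        return cutlass_scaled_mm_sim(qinput, w, x_scale, w_scale)
    return fallback_scaled_mm_sim(qinput, w, x_scale, w_scale, bias)

print(apply_fp8_linear_sim(torch.randn(2, 4), torch.randn(4, 3),
                           torch.tensor(1.0)).shape)  # torch.Size([2, 3])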
