diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu b/src/fastertransformer/kernels/layernorm_kernels.cu index f832d9104..369030b37 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -78,13 +78,13 @@ __global__ void generalAddBiasResidualLayerNormOpt(T* normed_output, if (IS_OUTPUT) { T in_val; if (scale_input) { - in_val = input[index]; - } - else { in_val = cuda_cast(cuda_cast(reinterpret_cast(input)[index]) * scale_from_int); } - val = hadd2(val, input[index]); + else { + in_val = input[index]; + } + val = hadd2(val, in_val); } shmem[i] = val; output[index] = val;