Commit

…to main
byshiue committed Mar 6, 2023
2 parents f7a4418 + 8bf96eb commit 303e052
Showing 4 changed files with 169 additions and 169 deletions.
8 changes: 4 additions & 4 deletions src/fastertransformer/kernels/layernorm_kernels.cu
@@ -78,13 +78,13 @@ __global__ void generalAddBiasResidualLayerNormOpt(T* normed_output,
         if (IS_OUTPUT) {
             T in_val;
             if (scale_input) {
-                in_val = input[index];
-            }
-            else {
                 in_val = cuda_cast<T>(cuda_cast<Float_Packed_T>(reinterpret_cast<const Int32_Packed_T*>(input)[index])
                                       * scale_from_int);
             }
-            val = hadd2(val, input[index]);
+            else {
+                in_val = input[index];
+            }
+            val = hadd2(val, in_val);
         }
         shmem[i] = val;
         output[index] = val;
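The change above routes the int32 GEMM output through the dequantization path when scale_input is set, and accumulates the dequantized in_val instead of the raw input[index]. Below is a minimal, standalone sketch of that idea with plain float types; the kernel name add_residual_dequant and its signature are illustrative only, not the FasterTransformer kernel (the real kernel works on packed half-precision values via cuda_cast and hadd2 and goes on to apply the layer norm).

// Sketch only: dequantize an int32 GEMM output with a per-tensor scale
// before adding it to the residual, mirroring the scale_input branch above.
#include <cstdint>
#include <cuda_runtime.h>

__global__ void add_residual_dequant(float*         out,
                                     const int32_t* gemm_out,       // int32 accumulators from an INT8 GEMM
                                     const float*   residual,
                                     float          scale_from_int, // dequantization scale
                                     bool           scale_input,
                                     int            n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    // When the input is quantized, convert and rescale it; otherwise the same
    // buffer already holds floats and can be reinterpreted directly.
    const float in_val = scale_input
                             ? static_cast<float>(gemm_out[idx]) * scale_from_int
                             : reinterpret_cast<const float*>(gemm_out)[idx];
    out[idx] = residual[idx] + in_val;
}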
@@ -139,7 +139,7 @@ void SelfAttentionFP8Layer<T1, T2>::forward(TensorMap* ou
     float attn_scale_2 =
         attention_weights->query_weight.output_h_scale[0]
         * attention_weights->attention_output_weight.input_h_scale_inv[0];  // v and output scale
-    dispatcher_fp8->setScaleList(attn_scale_1, 0.0f, attn_scale_2);
+    dispatcher_fp8->setScaleList(attn_scale_1, 1.0f, attn_scale_2);
     // For example, if a query is like
     // [[S_0, P_0], [S_1, P_1]], where S_i is real tokens and P_i is padded tokens.
     // In zero pad case, we remove the padding and the input looks like [S_0, S_1].
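The context comment explains the padding handling: given a batch [[S_0, P_0], [S_1, P_1]] of real tokens S_i and padded tokens P_i, the zero-pad path drops the padding so the packed input becomes [S_0, S_1]. A small host-side sketch of that packing step, using a hypothetical remove_padding helper rather than the FasterTransformer API:

// Sketch only: gather the real tokens [S_0, S_1] out of a padded batch
// [[S_0, P_0], [S_1, P_1]], as described in the comment above.
#include <vector>

// rows:        padded input, batch_size * max_seq_len token vectors
// seq_lengths: number of real tokens per sequence
std::vector<std::vector<float>> remove_padding(const std::vector<std::vector<float>>& rows,
                                               const std::vector<int>&                seq_lengths,
                                               int                                    max_seq_len)
{
    std::vector<std::vector<float>> packed;
    for (size_t b = 0; b < seq_lengths.size(); ++b) {
        for (int t = 0; t < seq_lengths[b]; ++t) {        // keep real tokens only
            packed.push_back(rows[b * max_seq_len + t]);  // drop the padded tail P_b
        }
    }
    return packed;  // concatenated [S_0, S_1] with no padding rows
}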

0 comments on commit 303e052
