adds softmax_scale to flash attention #209

Merged
merged 1 commit on Mar 2, 2023
8 changes: 5 additions & 3 deletions examples/llm/src/models/layers/attention.py
@@ -122,16 +122,18 @@ def __init__(self, cfg: DictConfig, device: Optional[str] = None):

         self.clip_qkv = cfg.get('attn_clip_qkv')
         self.attn_qk_ln = cfg.get('attn_qk_ln')
+        self.softmax_scale = cfg.get('softmax_scale')
         self.d_model = cfg.d_model
         self.n_heads = cfg.n_heads

-        if self.attn_qk_ln or self.clip_qkv:
+        if self.attn_qk_ln or self.clip_qkv or self.softmax_scale:
             self.W_qkv = nn.Linear(self.d_model,
                                    3 * self.d_model,
                                    bias=True,
                                    device=device)
             self.inner_attn = FlashAttention(attention_dropout=cfg.attn_pdrop,
Contributor:
I see we can now scale the attention by any custom value... what is the best value and how should we keep track of it?

Contributor:
1/sqrt(d/n_heads) is standard.
1/(d/n_heads) is the recommended muP mod.
Since it's part of the model config, it'll get dumped into wandb if we ever need to check what we used.
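
For concreteness, a small sketch of the two scale choices mentioned above (illustrative only; the d_model and n_heads values are made up and not taken from this PR):

    import math

    # Illustrative sketch of the two softmax_scale choices discussed above.
    d_model = 2048
    n_heads = 16
    head_dim = d_model // n_heads  # d / n_heads

    standard_scale = 1 / math.sqrt(head_dim)  # 1/sqrt(d/n_heads), the usual default
    mup_scale = 1 / head_dim                  # 1/(d/n_heads), the muP recommendation

    # Either value would be set as cfg.softmax_scale and forwarded to FlashAttention.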

-                                             device=device)
+                                             device=device,
+                                             softmax_scale=self.softmax_scale)
             self.out_proj = nn.Linear(self.d_model,
                                       self.d_model,
                                       bias=True,
@@ -157,7 +159,7 @@ def __init__(self, cfg: DictConfig, device: Optional[str] = None):
     def forward(self, x, key_padding_mask, attn_mask=None):
         assert attn_mask is None

-        if self.attn_qk_ln or self.clip_qkv:
+        if self.attn_qk_ln or self.clip_qkv or self.softmax_scale:
             qkv = self.W_qkv(x)
             if self.clip_qkv:
                 qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
7 changes: 7 additions & 0 deletions examples/llm/src/models/mosaic_gpt.py
@@ -48,6 +48,13 @@ def __init__(self, cfg: DictConfig):
                 'QKV clipping only implemented with flash and triton attention.'
             )

+        if cfg.get('softmax_scale') and cfg.attn_impl not in [
+                'flash', 'triton'
+        ]:
+            raise NotImplementedError(
+                'softmax_scale only implemented with flash and triton attention.'
+            )
+
         self.alibi = cfg.get('alibi', False)
         self.alibi_bias_max = cfg.get('alibi_bias_max',
                                       8 if self.alibi else None)
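
As a rough usage sketch (not part of the diff), the new key sits alongside the existing attention options in the model config. Only softmax_scale and attn_impl below are exercised by this PR; the other field names and values are placeholders for illustration:

    from omegaconf import OmegaConf

    # Hypothetical config fragment; keys other than 'softmax_scale' and
    # 'attn_impl' are placeholders, not dictated by this PR.
    cfg = OmegaConf.create({
        'd_model': 2048,
        'n_heads': 16,
        'attn_impl': 'flash',  # must be 'flash' or 'triton', per the check above
        'attn_pdrop': 0.0,
        'softmax_scale': 1 / (2048 // 16),  # e.g. the muP-style 1/(d/n_heads)
    })

    # With softmax_scale set, the attention module in attention.py takes the
    # FlashAttention branch and forwards cfg.softmax_scale as the
    # softmax_scale argument of FlashAttention.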