From db6ab64d39cdac56bbde4339d66b80f4e0b2c22a Mon Sep 17 00:00:00 2001
From: liuyuang <liuyuang@baidu.com>
Date: Thu, 11 Aug 2022 16:51:49 +0800
Subject: [PATCH 1/4] fused linear and selective recompute

---
 .gitignore                                    |   1 +
 examples/gpt/hybrid_parallel/README.md        |   4 +
 .../gpt/hybrid_parallel/configs_1.3B_dp8.yaml |   4 +-
 .../configs_175B_mp8_pp16.yaml                |   4 +-
 .../configs_6.7B_sharding16.yaml              |   4 +-
 examples/gpt/single/README.md                 |   4 +
 .../gpt/single/configs_1.3B_single_card.yaml  |   4 +-
 .../gpt/single/configs_345m_single_card.yaml  |   4 +-
 examples/gpt/tools.py                         |  13 +++
 fleetx/models/gpt_model/modeling.py           |  68 ++++++++----
 fleetx/models/gpt_model/modeling_hybrid.py    | 104 ++++++++++++++----
 11 files changed, 167 insertions(+), 47 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4f61b8cbb..6246eec82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -101,3 +101,4 @@ ENV/
 .mypy_cache/
 
 .DS_Store
+.idea
diff --git a/examples/gpt/hybrid_parallel/README.md b/examples/gpt/hybrid_parallel/README.md
index 05ccfcd81..56eacd871 100644
--- a/examples/gpt/hybrid_parallel/README.md
+++ b/examples/gpt/hybrid_parallel/README.md
@@ -95,6 +95,7 @@ GPT训练默认使用AdamW优化器以及cosine 学习率衰减，这里通过
   num_train_epochs: 1
   seed: 1024
   use_recompute: False
+  recompute_granularity:
   batch_size:
     global_batch_size: 8
     local_batch_size: 8
@@ -113,6 +114,7 @@ GPT训练默认使用AdamW优化器以及cosine 学习率衰减，这里通过
     save_steps: 1000
     output_dir: ./output
     ckpt_dir: 
+  fused_linear: False 
 ```
 
 其中参数说明：
@@ -124,6 +126,7 @@ GPT训练默认使用AdamW优化器以及cosine 学习率衰减，这里通过
 | num_train_epochs  | 训练的epoch数量                           |
 | seed              | 随机种子，保证训练过程可复现                       |
 | use_recompute     | 是否使用recompute训练                      |
+| recompute_granularity | recompute训练的粒度，可选 `full` `only_attn`，full即recompute全部transformer，only_attn表明只recompute self attention部分 |
 | global_batch_size | 全局的batch size大小，即一次参数更新等效的batch size |
 | local_batch_size  | 每个进程训练的batch size大小                  |
 | micro_batch_size  | 每次前向计算的batch size大小                  |
@@ -138,6 +141,7 @@ GPT训练默认使用AdamW优化器以及cosine 学习率衰减，这里通过
 | save_steps        | 保存模型间隔                               |
 | output_dir        | 指定输出文件                               |
 | ckpt_dir          | checkpoint的加载目录                      |
+| fused_linear      | 是否使用fused_linear代替传统Linear加速训练       |
 
 
 ### 并行维度
diff --git a/examples/gpt/hybrid_parallel/configs_1.3B_dp8.yaml b/examples/gpt/hybrid_parallel/configs_1.3B_dp8.yaml
index 0297c58a9..d4a4f2407 100644
--- a/examples/gpt/hybrid_parallel/configs_1.3B_dp8.yaml
+++ b/examples/gpt/hybrid_parallel/configs_1.3B_dp8.yaml
@@ -5,6 +5,7 @@ PreTraining:
   num_train_epochs: 1
   seed: 1024
   use_recompute: True
+  recompute_granularity: 'only_attn'
   batch_size:
     global_batch_size: 64
     local_batch_size: 8
@@ -22,7 +23,8 @@ PreTraining:
   save_load:
     save_steps: 1000
     output_dir: ./output
-    ckpt_dir: 
+    ckpt_dir:
+  fused_linear: True
 
   Model:
     vocab_size: 50304
diff --git a/examples/gpt/hybrid_parallel/configs_175B_mp8_pp16.yaml b/examples/gpt/hybrid_parallel/configs_175B_mp8_pp16.yaml
index 713b3dae4..3b1042223 100644
--- a/examples/gpt/hybrid_parallel/configs_175B_mp8_pp16.yaml
+++ b/examples/gpt/hybrid_parallel/configs_175B_mp8_pp16.yaml
@@ -5,6 +5,7 @@ PreTraining:
   num_train_epochs: 1
   seed: 1024
   use_recompute: True
+  recompute_granularity: 'only_attn'
   batch_size:
     global_batch_size: 1536
     local_batch_size: 1536
@@ -22,7 +23,8 @@ PreTraining:
   save_load:
     save_steps: 1000
     output_dir: ./output
-    ckpt_dir: 
+    ckpt_dir:
+  fused_linear: True
 
   Model:
     vocab_size: 51200
diff --git a/examples/gpt/hybrid_parallel/configs_6.7B_sharding16.yaml b/examples/gpt/hybrid_parallel/configs_6.7B_sharding16.yaml
index 8d8745cc8..fb34d46ea 100644
--- a/examples/gpt/hybrid_parallel/configs_6.7B_sharding16.yaml
+++ b/examples/gpt/hybrid_parallel/configs_6.7B_sharding16.yaml
@@ -5,6 +5,7 @@ PreTraining:
   num_train_epochs: 1
   seed: 1024
   use_recompute: True
+  recompute_granularity: 'only_attn'
   batch_size:
     global_batch_size: 128
     local_batch_size: 8
@@ -22,7 +23,8 @@ PreTraining:
   save_load:
     save_steps: 1000
     output_dir: ./output
-    ckpt_dir: 
+    ckpt_dir:
+  fused_linear: True
 
   Model:
     vocab_size: 50304
diff --git a/examples/gpt/single/README.md b/examples/gpt/single/README.md
index ba4f5565e..84133c31e 100644
--- a/examples/gpt/single/README.md
+++ b/examples/gpt/single/README.md
@@ -85,6 +85,7 @@ GPT训练默认使用AdamW优化器以及cosine 学习率衰减，这里通过
   num_train_epochs: 1
   seed: 1024
   use_recompute: False
+  recompute_granularity:
   batch_size:
     global_batch_size: 8
     local_batch_size: 8
@@ -103,6 +104,7 @@ GPT训练默认使用AdamW优化器以及cosine 学习率衰减，这里通过
     save_steps: 1000
     output_dir: ./output
     ckpt_dir: 
+  fused_linear: False 
 ```
 
 其中参数说明：
@@ -114,6 +116,7 @@ GPT训练默认使用AdamW优化器以及cosine 学习率衰减，这里通过
 | num_train_epochs  | 训练的epoch数量                           |
 | seed              | 随机种子，保证训练过程可复现                       |
 | use_recompute     | 是否使用recompute训练                      |
+| recompute_granularity | recompute训练的粒度，可选 `full` `only_attn`，full即recompute全部transformer，only_attn表明只recompute self attention部分 |
 | global_batch_size | 全局的batch size大小，即一次参数更新等效的batch size |
 | local_batch_size  | 每个进程训练的batch size大小                  |
 | micro_batch_size  | 每次前向计算的batch size大小                  |
@@ -128,6 +131,7 @@ GPT训练默认使用AdamW优化器以及cosine 学习率衰减，这里通过
 | save_steps        | 保存模型间隔                               |
 | output_dir        | 指定输出文件                               |
 | ckpt_dir          | checkpoint的加载目录                      |
+| fused_linear      | 是否使用fused_linear代替传统Linear加速训练       |
 
 
 ## 运行方式
diff --git a/examples/gpt/single/configs_1.3B_single_card.yaml b/examples/gpt/single/configs_1.3B_single_card.yaml
index b0cdabc19..7c00b1b08 100644
--- a/examples/gpt/single/configs_1.3B_single_card.yaml
+++ b/examples/gpt/single/configs_1.3B_single_card.yaml
@@ -5,6 +5,7 @@ PreTraining:
   num_train_epochs: 1
   seed: 1024
   use_recompute: True
+  recompute_granularity: 'only_attn'
   batch_size:
     global_batch_size: 8
     local_batch_size: 8
@@ -22,7 +23,8 @@ PreTraining:
   save_load:
     save_steps: 1000
     output_dir: ./output
-    ckpt_dir: 
+    ckpt_dir:
+  fused_linear: True
 
   Model:
     vocab_size: 50304
diff --git a/examples/gpt/single/configs_345m_single_card.yaml b/examples/gpt/single/configs_345m_single_card.yaml
index 27c9cf9ac..07c8754ce 100644
--- a/examples/gpt/single/configs_345m_single_card.yaml
+++ b/examples/gpt/single/configs_345m_single_card.yaml
@@ -5,6 +5,7 @@ PreTraining:
   num_train_epochs: 1
   seed: 1024
   use_recompute: False
+  recompute_granularity:
   batch_size:
     global_batch_size: 8
     local_batch_size: 8
@@ -22,7 +23,8 @@ PreTraining:
   save_load:
     save_steps: 1000
     output_dir: ./output
-    ckpt_dir: 
+    ckpt_dir:
+  fused_linear: True
 
   Model:
     vocab_size: 50304
diff --git a/examples/gpt/tools.py b/examples/gpt/tools.py
index d1e1488da..8be75e04f 100644
--- a/examples/gpt/tools.py
+++ b/examples/gpt/tools.py
@@ -22,6 +22,7 @@
 import yaml
 import paddle
 import paddle.distributed as dist
+from paddle.fluid import core
 import argparse
 from fleetx.datasets.gpt import create_pretrained_dataset, get_train_data_file
 
@@ -49,6 +50,13 @@ def process_batch_size(args):
     assert args.local_batch_size % args.micro_batch_size == 0
 
 
+def is_fused_matmul_bias_supported():
+    if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():
+        return hasattr(core.ops, 'fused_gemm_epilogue')
+    else:
+        return False
+
+
 def model_size(args):
     """
     get model size for transformer
@@ -84,6 +92,11 @@ def add_dict(config, k, v):
 
     args.test_iters = args.eval_iters * 10
 
+    if args.fused_linear:
+        assert is_fused_matmul_bias_supported(), \
+            "The flag fused_linear only valid for cuda version higher than 11.6, "\
+            "but the paddle is compiled with cuda " + paddle.version.cuda()
+
     # process batch size
     process_batch_size(args)
 
diff --git a/fleetx/models/gpt_model/modeling.py b/fleetx/models/gpt_model/modeling.py
index 812a6e0dd..bb6d7d237 100644
--- a/fleetx/models/gpt_model/modeling.py
+++ b/fleetx/models/gpt_model/modeling.py
@@ -24,6 +24,7 @@
 import paddle.incubate as incubate
 from paddle.distributed.fleet.utils import recompute
 from .config import configurable
+from paddle.incubate.nn import FusedLinear
 
 
 class MultiHeadAttention(nn.Layer):
@@ -46,7 +47,8 @@ def __init__(self,
                  need_weights=False,
                  weight_attr=None,
                  bias_attr=None,
-                 fuse=True):
+                 fuse=True,
+                 fused_linear=False):
         super(MultiHeadAttention, self).__init__()
         self.embed_dim = embed_dim
         self.kdim = kdim if kdim is not None else embed_dim
@@ -59,19 +61,21 @@ def __init__(self,
         self.head_dim = embed_dim // num_heads
         assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
 
+        Linear = FusedLinear if fused_linear else nn.Linear
+
         if self.fuse:
             assert self.kdim == embed_dim
             assert self.vdim == embed_dim
-            self.qkv_proj = nn.Linear(
+            self.qkv_proj = Linear(
                 embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr)
         else:
-            self.q_proj = nn.Linear(
+            self.q_proj = Linear(
                 embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
-            self.k_proj = nn.Linear(
+            self.k_proj = Linear(
                 self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)
-            self.v_proj = nn.Linear(
+            self.v_proj = Linear(
                 self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)
-        self.out_proj = nn.Linear(
+        self.out_proj = Linear(
             embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
 
     def _fuse_prepare_qkv(self, query):
@@ -221,13 +225,15 @@ def __init__(self,
                  num_layers,
                  norm=None,
                  hidden_size=None,
-                 use_recompute=False):
+                 use_recompute=False,
+                 recompute_granularity="full"):
         super(TransformerDecoder, self).__init__()
 
         self.num_layers = num_layers
         self.layers = decoder_layers
         self.norm = norm
         self.use_recompute = use_recompute
+        self.recompute_granularity = recompute_granularity
         if norm == "LayerNorm":
             self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5)
         elif norm is not None:
@@ -258,9 +264,11 @@ def forward(self,
                                             cache=cache)
                     new_caches.append(new_cache)
                 else:
-                    output = recompute(mod, output, memory, tgt_mask, use_cache, cache) if self.use_recompute \
-                        else mod(output, memory, tgt_mask, use_cache, cache)
-
+                    if self.use_recompute and self.recompute_granularity == "full":
+                        output = recompute(mod, output, memory, tgt_mask, use_cache, cache)
+                    else:
+                        recompute_attn = self.use_recompute and self.recompute_granularity == "only_attn"
+                        output = mod(output, memory, tgt_mask, use_cache, cache, recompute_attn)
             else:
                 output, new_cache = mod(output,
                                         memory,
@@ -304,7 +312,8 @@ def __init__(self,
                  act_dropout=None,
                  normalize_before=True,
                  weight_attr=None,
-                 bias_attr=None):
+                 bias_attr=None,
+                 fused_linear=False):
         self._config = locals()
         self._config.pop("self")
         self._config.pop("__class__", None)  # py3
@@ -317,15 +326,18 @@ def __init__(self,
         weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
         bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
 
+        Linear = FusedLinear if fused_linear else nn.Linear
+
         self.self_attn = MultiHeadAttention(
             d_model,
             nhead,
             dropout=attn_dropout,
             weight_attr=weight_attrs[0],
-            bias_attr=bias_attrs[0])
-        self.linear1 = nn.Linear(
+            bias_attr=bias_attrs[0],
+            fused_linear=fused_linear)
+        self.linear1 = Linear(
             d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2])
-        self.linear2 = nn.Linear(
+        self.linear2 = Linear(
             dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2])
 
         self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
@@ -334,14 +346,17 @@ def __init__(self,
         self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train")
         self.activation = getattr(F, activation)
 
-    def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
+    def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None, recompute_attn=False):
         residual = tgt
 
         if self.normalize_before:
             tgt = self.norm1(tgt)
 
         if use_cache is False:
-            tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
+            if recompute_attn:
+                tgt = recompute(self.self_attn, tgt, None, None, tgt_mask, use_cache, cache)
+            else:
+                tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
         else:
             tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                     use_cache, cache)
@@ -421,10 +436,19 @@ def __init__(self,
                  max_position_embeddings=512,
                  type_vocab_size=16,
                  use_recompute=False,
-                 initializer_range=0.02):
+                 initializer_range=0.02,
+                 fused_linear=False,
+                 recompute_granularity="full"):
 
         super(GPTModel, self).__init__()
 
+        if use_recompute:
+            if recompute_granularity is None:
+                recompute_granularity = "full"
+            assert recompute_granularity in ["full", "only_attn"], \
+                "recompute_granularity can be only chosen from None, " \
+                "full or only_attn, but received " + recompute_granularity
+
         self.initializer_range = initializer_range
         self.hidden_size = hidden_size
         self.vocab_size = vocab_size
@@ -447,14 +471,16 @@ def __init__(self,
                     weight_attr=paddle.ParamAttr(
                         initializer=nn.initializer.Normal(
                             mean=0.0, std=self.initializer_range)),
-                    bias_attr=None))
+                    bias_attr=None,
+                    fused_linear=fused_linear))
 
         self.decoder = TransformerDecoder(
             decoder_layers,
             num_layers,
             norm="LayerNorm",
             hidden_size=hidden_size,
-            use_recompute=use_recompute)
+            use_recompute=use_recompute,
+            recompute_granularity=recompute_granularity)
 
     @classmethod
     def from_config(cls, cfg):
@@ -469,7 +495,9 @@ def from_config(cls, cfg):
             "max_position_embeddings": cfg.max_position_embeddings,
             "type_vocab_size": cfg.type_vocab_size,
             "initializer_range": cfg.initializer_range,
-            "use_recompute": cfg.use_recompute
+            "use_recompute": cfg.use_recompute,
+            "fused_linear": cfg.fused_linear,
+            "recompute_granularity": cfg.recompute_granularity
         }
 
     def forward(self,
diff --git a/fleetx/models/gpt_model/modeling_hybrid.py b/fleetx/models/gpt_model/modeling_hybrid.py
index b46b5566e..ecb97bcc3 100644
--- a/fleetx/models/gpt_model/modeling_hybrid.py
+++ b/fleetx/models/gpt_model/modeling_hybrid.py
@@ -15,6 +15,8 @@
 # limitations under the License.
 
 import collections
+import logging
+
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
@@ -76,7 +78,8 @@ def __init__(self,
                  weight_attr=None,
                  bias_attr=None,
                  fuse=True,
-                 num_partitions=1):
+                 num_partitions=1,
+                 fused_linear=False):
         super(MultiHeadAttention, self).__init__()
         self.embed_dim = embed_dim
         self.kdim = kdim if kdim is not None else embed_dim
@@ -102,35 +105,40 @@ def __init__(self,
                 3 * embed_dim,
                 weight_attr=weight_attr,
                 has_bias=True,
-                gather_output=False)
+                gather_output=False,
+                fuse_matmul_bias=fused_linear)
         else:
             self.q_proj = fleet.meta_parallel.ColumnParallelLinear(
                 embed_dim,
                 embed_dim,
                 weight_attr=weight_attr,
                 has_bias=True,
-                gather_output=False)
+                gather_output=False,
+                fuse_matmul_bias=fused_linear)
 
             self.k_proj = fleet.meta_parallel.ColumnParallelLinear(
                 self.kdim,
                 embed_dim,
                 weight_attr=weight_attr,
                 has_bias=True,
-                gather_output=False)
+                gather_output=False,
+                fuse_matmul_bias=fused_linear)
 
             self.v_proj = fleet.meta_parallel.ColumnParallelLinear(
                 self.vdim,
                 embed_dim,
                 weight_attr=weight_attr,
                 has_bias=True,
-                gather_output=False)
+                gather_output=False,
+                fuse_matmul_bias=fused_linear)
 
         self.out_proj = fleet.meta_parallel.RowParallelLinear(
             embed_dim,
             embed_dim,
             weight_attr=weight_attr,
             has_bias=True,
-            input_is_parallel=True)
+            input_is_parallel=True,
+            fuse_matmul_bias=fused_linear)
 
     def _fuse_prepare_qkv(self, query):
         mix_layer = self.qkv_proj(query)
@@ -280,13 +288,15 @@ def __init__(self,
                  num_layers,
                  norm=None,
                  hidden_size=None,
-                 use_recompute=False):
+                 use_recompute=False,
+                 recompute_granularity="full"):
         super(TransformerDecoder, self).__init__()
 
         self.num_layers = num_layers
         self.layers = decoder_layers
         self.norm = norm
         self.use_recompute = use_recompute
+        self.recompute_granularity = recompute_granularity
         if norm == "LayerNorm":
             self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5)
         elif norm is not None:
@@ -317,8 +327,10 @@ def forward(self,
                                             cache=cache)
                     new_caches.append(new_cache)
                 else:
-                    output = recompute(mod, output, memory, tgt_mask, use_cache, cache) if self.use_recompute \
-                        else mod(output, memory, tgt_mask, use_cache, cache)
+                    if self.use_recompute and self.recompute_granularity == "full":
+                        output = recompute(mod, output, memory, tgt_mask, use_cache, cache)
+                    else:
+                        output = mod(output, memory, tgt_mask, use_cache, cache)
 
             else:
                 output, new_cache = mod(output,
@@ -364,7 +376,9 @@ def __init__(self,
                  normalize_before=True,
                  weight_attr=None,
                  bias_attr=None,
-                 num_partitions=1):
+                 num_partitions=1,
+                 fused_linear=False,
+                 recompute_attn=False):
         self._config = locals()
         self._config.pop("self")
         self._config.pop("__class__", None)  # py3
@@ -373,6 +387,7 @@ def __init__(self,
         attn_dropout = dropout if attn_dropout is None else attn_dropout
         act_dropout = dropout if act_dropout is None else act_dropout
         self.normalize_before = normalize_before
+        self.recompute_attn = recompute_attn
 
         weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
         bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
@@ -383,21 +398,24 @@ def __init__(self,
             dropout=attn_dropout,
             weight_attr=weight_attrs[0],
             bias_attr=bias_attrs[0],
-            num_partitions=num_partitions)
+            num_partitions=num_partitions,
+            fused_linear=fused_linear)
 
         self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
             d_model,
             dim_feedforward,
             weight_attr=weight_attrs[2],
             gather_output=False,
-            has_bias=True)
+            has_bias=True,
+            fuse_matmul_bias=fused_linear)
 
         self.linear2 = fleet.meta_parallel.RowParallelLinear(
             dim_feedforward,
             d_model,
             weight_attr=weight_attrs[2],
             input_is_parallel=True,
-            has_bias=True)
+            has_bias=True,
+            fuse_matmul_bias=fused_linear)
 
         self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
         self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
@@ -417,7 +435,10 @@ def forward(self,
             tgt = self.norm1(tgt)
 
         if use_cache is False:
-            tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
+            if self.recompute_attn:
+                tgt = recompute(self.self_attn, tgt, None, None, tgt_mask, use_cache, cache)
+            else:
+                tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
         else:
             tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                     use_cache, cache)
@@ -505,10 +526,23 @@ def __init__(self,
                  type_vocab_size=16,
                  initializer_range=0.02,
                  num_partitions=1,
-                 use_recompute=False):
+                 use_recompute=False,
+                 fused_linear=False,
+                 recompute_granularity="full"):
 
         super(GPTModel, self).__init__()
 
+        if use_recompute:
+            if not isinstance(recompute_granularity, str):
+                logging.Logger("You are using recompute but not set recompute granularity, "
+                               "the granularity will be set to full as default.")
+                recompute_granularity = "full"
+            assert recompute_granularity in ["full", "only_attn"], \
+                "recompute_granularity can be only chosen from " \
+                "full or only_attn, but received " + recompute_granularity
+
+        recompute_attn = use_recompute and recompute_granularity == "only_attn"
+
         self.initializer_range = initializer_range
         self.hidden_size = hidden_size
         self.vocab_size = vocab_size
@@ -532,14 +566,17 @@ def __init__(self,
                         initializer=nn.initializer.Normal(
                             mean=0.0, std=self.initializer_range)),
                     bias_attr=None,
-                    num_partitions=num_partitions))
+                    num_partitions=num_partitions,
+                    fused_linear=fused_linear,
+                    recompute_attn=recompute_attn))
 
         self.decoder = TransformerDecoder(
             decoder_layers,
             num_layers,
             norm="LayerNorm",
             hidden_size=hidden_size,
-            use_recompute=use_recompute)
+            use_recompute=use_recompute,
+            recompute_granularity=recompute_granularity)
 
     @classmethod
     def from_config(cls, cfg):
@@ -555,7 +592,9 @@ def from_config(cls, cfg):
             "type_vocab_size": cfg.type_vocab_size,
             "initializer_range": cfg.initializer_range,
             "num_partitions": cfg.mp_degree,
-            "use_recompute": cfg.use_recompute
+            "use_recompute": cfg.use_recompute,
+            "fused_linear": cfg.fused_linear,
+            "recompute_granularity": cfg.recompute_granularity
         }
 
     def forward(self,
@@ -727,7 +766,20 @@ def __init__(self,
                  initializer_range=0.02,
                  num_partitions=1,
                  topology=None,
-                 use_recompute=False):
+                 use_recompute=False,
+                 fused_linear=False,
+                 recompute_granularity="full"):
+
+        if use_recompute:
+            if not isinstance(recompute_granularity, str):
+                logging.Logger("You are using recompute but not set recompute granularity, "
+                               "the granularity will be set to full as default.")
+                recompute_granularity = "full"
+            assert recompute_granularity in ["full", "only_attn"], \
+                "recompute_granularity can be only chosen from " \
+                "full or only_attn, but received " + recompute_granularity
+
+        recompute_attn = use_recompute and recompute_granularity == "only_attn"
 
         # forward desc
         self.descs = []
@@ -759,7 +811,9 @@ def __init__(self,
                         initializer=nn.initializer.Normal(
                             mean=0.0, std=initializer_range)),
                     bias_attr=None,
-                    num_partitions=num_partitions))
+                    num_partitions=num_partitions,
+                    fused_linear=fused_linear,
+                    recompute_attn=recompute_attn))
 
         self.descs.append(
             LayerDesc(
@@ -781,12 +835,16 @@ def _logits_helper(embedding, output):
                 type_vocab_size=type_vocab_size,
                 initializer_range=0.02))
 
+        recompute_interval = 0
+        if recompute and not recompute_attn:
+            recompute_interval = 1
+
         super().__init__(
             layers=self.descs,
             loss_fn=GPTPretrainingCriterionPipe(),
             topology=topology,
             seg_method="layer:TransformerDecoderLayer",
-            recompute_interval=1 if use_recompute else 0,
+            recompute_interval=recompute_interval,
             recompute_partition=False,
             recompute_offload=False)
 
@@ -805,5 +863,7 @@ def from_config(cls, cfg):
             "initializer_range": cfg.initializer_range,
             "num_partitions": cfg.mp_degree,
             "use_recompute": cfg.use_recompute,
-            "topology": cfg.topology
+            "topology": cfg.topology,
+            "fused_linear": cfg.fused_linear,
+            "recompute_granularity": cfg.recompute_granularity
         }

From b840d6df97f183c46c92b6da3317beb30ba859ae Mon Sep 17 00:00:00 2001
From: liuyuang <liuyuang@baidu.com>
Date: Thu, 11 Aug 2022 17:02:58 +0800
Subject: [PATCH 2/4] make all the same

---
 fleetx/models/gpt_model/modeling.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/fleetx/models/gpt_model/modeling.py b/fleetx/models/gpt_model/modeling.py
index bb6d7d237..13ece3e2c 100644
--- a/fleetx/models/gpt_model/modeling.py
+++ b/fleetx/models/gpt_model/modeling.py
@@ -15,6 +15,8 @@
 # limitations under the License.
 
 import collections
+import logging
+
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
@@ -267,8 +269,7 @@ def forward(self,
                     if self.use_recompute and self.recompute_granularity == "full":
                         output = recompute(mod, output, memory, tgt_mask, use_cache, cache)
                     else:
-                        recompute_attn = self.use_recompute and self.recompute_granularity == "only_attn"
-                        output = mod(output, memory, tgt_mask, use_cache, cache, recompute_attn)
+                        output = mod(output, memory, tgt_mask, use_cache, cache)
             else:
                 output, new_cache = mod(output,
                                         memory,
@@ -313,7 +314,8 @@ def __init__(self,
                  normalize_before=True,
                  weight_attr=None,
                  bias_attr=None,
-                 fused_linear=False):
+                 fused_linear=False,
+                 recompute_attn=False):
         self._config = locals()
         self._config.pop("self")
         self._config.pop("__class__", None)  # py3
@@ -322,6 +324,7 @@ def __init__(self,
         attn_dropout = dropout if attn_dropout is None else attn_dropout
         act_dropout = dropout if act_dropout is None else act_dropout
         self.normalize_before = normalize_before
+        self.recompute_attn = recompute_attn
 
         weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
         bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
@@ -346,14 +349,14 @@ def __init__(self,
         self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train")
         self.activation = getattr(F, activation)
 
-    def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None, recompute_attn=False):
+    def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
         residual = tgt
 
         if self.normalize_before:
             tgt = self.norm1(tgt)
 
         if use_cache is False:
-            if recompute_attn:
+            if self.recompute_attn:
                 tgt = recompute(self.self_attn, tgt, None, None, tgt_mask, use_cache, cache)
             else:
                 tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
@@ -443,12 +446,16 @@ def __init__(self,
         super(GPTModel, self).__init__()
 
         if use_recompute:
-            if recompute_granularity is None:
+            if not isinstance(recompute_granularity, str):
+                logging.Logger("You are using recompute but not set recompute granularity, "
+                               "the granularity will be set to full as default.")
                 recompute_granularity = "full"
             assert recompute_granularity in ["full", "only_attn"], \
-                "recompute_granularity can be only chosen from None, " \
+                "recompute_granularity can be only chosen from " \
                 "full or only_attn, but received " + recompute_granularity
 
+        recompute_attn = use_recompute and recompute_granularity == "only_attn"
+
         self.initializer_range = initializer_range
         self.hidden_size = hidden_size
         self.vocab_size = vocab_size
@@ -472,7 +479,8 @@ def __init__(self,
                         initializer=nn.initializer.Normal(
                             mean=0.0, std=self.initializer_range)),
                     bias_attr=None,
-                    fused_linear=fused_linear))
+                    fused_linear=fused_linear,
+                    recompute_attn=recompute_attn))
 
         self.decoder = TransformerDecoder(
             decoder_layers,

From 4d02bccabe9f0b25534fdbff0cc42480f92a4e0b Mon Sep 17 00:00:00 2001
From: liuyuang <liuyuang@baidu.com>
Date: Thu, 11 Aug 2022 17:09:29 +0800
Subject: [PATCH 3/4] move args checker to tools

---
 examples/gpt/tools.py                      | 11 +++++++++++
 fleetx/models/gpt_model/modeling.py        |  9 ---------
 fleetx/models/gpt_model/modeling_hybrid.py | 18 ------------------
 3 files changed, 11 insertions(+), 27 deletions(-)

diff --git a/examples/gpt/tools.py b/examples/gpt/tools.py
index 8be75e04f..51963deed 100644
--- a/examples/gpt/tools.py
+++ b/examples/gpt/tools.py
@@ -97,6 +97,17 @@ def add_dict(config, k, v):
             "The flag fused_linear only valid for cuda version higher than 11.6, "\
             "but the paddle is compiled with cuda " + paddle.version.cuda()
 
+    if args.recompute:
+        assert args.recompute_granularity is None or \
+               isinstance(args.recompute_granularity, str), \
+            "recompute_granularity must be a None or a string object"
+        if args.recompute_granularity is None:
+            args.recompute_granularity = "full"
+        else:
+            assert args.recompute_granularity in ["full", "only_attn"], \
+                "recompute_granularity can be only chosen from " \
+                "full or only_attn, but received " + args.recompute_granularity
+
     # process batch size
     process_batch_size(args)
 
diff --git a/fleetx/models/gpt_model/modeling.py b/fleetx/models/gpt_model/modeling.py
index 13ece3e2c..83fff2bcc 100644
--- a/fleetx/models/gpt_model/modeling.py
+++ b/fleetx/models/gpt_model/modeling.py
@@ -445,15 +445,6 @@ def __init__(self,
 
         super(GPTModel, self).__init__()
 
-        if use_recompute:
-            if not isinstance(recompute_granularity, str):
-                logging.Logger("You are using recompute but not set recompute granularity, "
-                               "the granularity will be set to full as default.")
-                recompute_granularity = "full"
-            assert recompute_granularity in ["full", "only_attn"], \
-                "recompute_granularity can be only chosen from " \
-                "full or only_attn, but received " + recompute_granularity
-
         recompute_attn = use_recompute and recompute_granularity == "only_attn"
 
         self.initializer_range = initializer_range
diff --git a/fleetx/models/gpt_model/modeling_hybrid.py b/fleetx/models/gpt_model/modeling_hybrid.py
index ecb97bcc3..578dd9cee 100644
--- a/fleetx/models/gpt_model/modeling_hybrid.py
+++ b/fleetx/models/gpt_model/modeling_hybrid.py
@@ -532,15 +532,6 @@ def __init__(self,
 
         super(GPTModel, self).__init__()
 
-        if use_recompute:
-            if not isinstance(recompute_granularity, str):
-                logging.Logger("You are using recompute but not set recompute granularity, "
-                               "the granularity will be set to full as default.")
-                recompute_granularity = "full"
-            assert recompute_granularity in ["full", "only_attn"], \
-                "recompute_granularity can be only chosen from " \
-                "full or only_attn, but received " + recompute_granularity
-
         recompute_attn = use_recompute and recompute_granularity == "only_attn"
 
         self.initializer_range = initializer_range
@@ -770,15 +761,6 @@ def __init__(self,
                  fused_linear=False,
                  recompute_granularity="full"):
 
-        if use_recompute:
-            if not isinstance(recompute_granularity, str):
-                logging.Logger("You are using recompute but not set recompute granularity, "
-                               "the granularity will be set to full as default.")
-                recompute_granularity = "full"
-            assert recompute_granularity in ["full", "only_attn"], \
-                "recompute_granularity can be only chosen from " \
-                "full or only_attn, but received " + recompute_granularity
-
         recompute_attn = use_recompute and recompute_granularity == "only_attn"
 
         # forward desc

From 49ad9e06248216c76a5f2f996e1eddb8f16e8360 Mon Sep 17 00:00:00 2001
From: liuyuang <liuyuang@baidu.com>
Date: Thu, 11 Aug 2022 17:17:50 +0800
Subject: [PATCH 4/4] fix fused linear

---
 examples/gpt/hybrid_parallel/README.md | 2 +-
 examples/gpt/single/README.md          | 2 +-
 examples/gpt/tools.py                  | 9 +++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/gpt/hybrid_parallel/README.md b/examples/gpt/hybrid_parallel/README.md
index 56eacd871..22d3a7f37 100644
--- a/examples/gpt/hybrid_parallel/README.md
+++ b/examples/gpt/hybrid_parallel/README.md
@@ -141,7 +141,7 @@ GPT训练默认使用AdamW优化器以及cosine 学习率衰减，这里通过
 | save_steps        | 保存模型间隔                               |
 | output_dir        | 指定输出文件                               |
 | ckpt_dir          | checkpoint的加载目录                      |
-| fused_linear      | 是否使用fused_linear代替传统Linear加速训练       |
+| fused_linear      | 是否使用fused_linear代替传统Linear加速训练。注：该功能需要cuda 11.6及以上编译的paddle支持。       |
 
 
 ### 并行维度
diff --git a/examples/gpt/single/README.md b/examples/gpt/single/README.md
index 84133c31e..22f3c0828 100644
--- a/examples/gpt/single/README.md
+++ b/examples/gpt/single/README.md
@@ -131,7 +131,7 @@ GPT训练默认使用AdamW优化器以及cosine 学习率衰减，这里通过
 | save_steps        | 保存模型间隔                               |
 | output_dir        | 指定输出文件                               |
 | ckpt_dir          | checkpoint的加载目录                      |
-| fused_linear      | 是否使用fused_linear代替传统Linear加速训练       |
+| fused_linear      | 是否使用fused_linear代替传统Linear加速训练。注：该功能需要cuda 11.6及以上编译的paddle支持。       |
 
 
 ## 运行方式
diff --git a/examples/gpt/tools.py b/examples/gpt/tools.py
index 51963deed..fcf1dfff4 100644
--- a/examples/gpt/tools.py
+++ b/examples/gpt/tools.py
@@ -16,6 +16,7 @@
 from __future__ import division
 from __future__ import print_function
 
+import logging
 import os
 import sys
 
@@ -92,10 +93,10 @@ def add_dict(config, k, v):
 
     args.test_iters = args.eval_iters * 10
 
-    if args.fused_linear:
-        assert is_fused_matmul_bias_supported(), \
-            "The flag fused_linear only valid for cuda version higher than 11.6, "\
-            "but the paddle is compiled with cuda " + paddle.version.cuda()
+    if args.fused_linear and not is_fused_matmul_bias_supported():
+        args.fused_linear = False
+        logging.warning("The flag fused_linear only valid for cuda version higher than 11.6, "
+                        "but the paddle is compiled with cuda " + paddle.version.cuda())
 
     if args.recompute:
         assert args.recompute_granularity is None or \