add iluvatar bert case. #56
Review comment: Please restore or fix the blank line.
@@ -144,6 +144,7 @@ def activate(base_config,

    parsed_params = parse_from_args_and_config(params, cmd_args, ext_config,
                                                enable_extern_config)

    # TODO: consider switching to a more elegant approach later
    if "tensorflow2" in base_config.__path__:
        base_config.override(parsed_params.__dict__, False)

Review comment: Same as above (please restore or fix the blank line).
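For context, here is a minimal sketch of what a config-override helper like the `base_config.override(parsed_params.__dict__, False)` call above might look like. This is an illustrative assumption, not FlagPerf's actual implementation; the class and argument names below are hypothetical.

```python
# Hypothetical sketch of a config object with an override() helper.
# The second argument mirrors the boolean flag passed in the diff above,
# interpreted here as "strict" key checking (an assumption).
class BaseConfig:
    def __init__(self, **defaults):
        self.__dict__.update(defaults)

    def override(self, params: dict, strict: bool = True):
        """Copy values from `params` onto this config.

        With strict=True, unknown keys raise; with strict=False,
        unknown keys are silently added.
        """
        for key, value in params.items():
            if strict and not hasattr(self, key):
                raise KeyError(f"unknown config key: {key}")
            setattr(self, key, value)


if __name__ == "__main__":
    cfg = BaseConfig(train_batch_size=12, fp16=False)
    cfg.override({"fp16": True, "max_steps": 1000000}, False)
    print(cfg.fp16, cfg.max_steps)  # True 1000000
```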
@@ -0,0 +1,40 @@

## Model Information
### Model Introduction

BERT stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications.

BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (a 7.7-point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5-point absolute improvement), and SQuAD v2.0 Test F1 to 83.1 (5.1-point absolute improvement).

Please refer to this paper for a detailed description of BERT:
[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
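The "one additional output layer" idea above can be made concrete with a short sketch. This is not code from this repository; `encoder` is a placeholder for a pre-trained BERT body, and the shapes are assumptions.

```python
import torch
from torch import nn


class BertClassifier(nn.Module):
    """Hedged sketch: a task head on top of a pre-trained encoder (placeholder)."""

    def __init__(self, encoder: nn.Module, hidden_size: int, num_labels: int):
        super().__init__()
        self.encoder = encoder                                # pre-trained BERT body
        self.classifier = nn.Linear(hidden_size, num_labels)  # the single new layer

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        hidden_states = self.encoder(input_ids, attention_mask)  # (batch, seq, hidden), assumed
        pooled = hidden_states[:, 0]                              # [CLS] token representation
        return self.classifier(pooled)                            # task-specific logits
```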
### Model Checkpoint Download
[Model checkpoint download](../../benchmarks/bert/pytorch/readme.md#模型信息与数据集模型checkpoint下载)
### Test Dataset Download
[Test dataset download](../../benchmarks/bert/pytorch/readme.md#模型信息与数据集模型checkpoint下载)
### Iluvatar BI-V100 GPU Configuration and Run Information Reference
#### Environment Configuration
- ##### Hardware environment
  - Machine and accelerator model: Iluvatar BI-V100 32GB

- ##### Software environment
  - OS version: Ubuntu 20.04
  - OS kernel version: 4.15.0-156-generic x86_64
  - Accelerator driver version: 3.0.0
  - Docker version: 20.10.8
  - Training framework version: torch-1.10.2+corex.3.0.0
  - Dependency versions: none
### Run Results

| Training resources | Config file | Runtime (s) | Target accuracy | Converged accuracy | Steps | Throughput (samples/s) |
| ------------------ | ------------------ | ----------- | --------------- | ------------------ | ------- | ---------------------- |
| Single machine, 1 card  | config_BI-V100x1x1 | 17854.76 | 0.72 | 0.7325  | 25000   | 17.00   |
| Single machine, 8 cards | config_BI-V100x1x8 | 20312.57 | 0.72 | 0.9619  | 25000   | 118.45  |
| Two machines, 8 cards   | config_BI-V100x2x8 | pending  | 0.72 | pending | pending | pending |
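As a rough sanity check on the samples/s column, throughput can be approximated as steps times global batch size divided by wall time, assuming the per-GPU batch size of 12 from the common config later in this PR and that the reported runtime covers essentially the whole run; the small gap to the reported values presumably comes from setup or evaluation time not counted in the throughput measurement.

```python
# Back-of-the-envelope check of the samples/s column.
# Assumption: global batch size = 12 per GPU (train_batch_size in the common
# config below) times the number of GPUs; wall time is the reported runtime.
def approx_throughput(steps, per_gpu_batch, num_gpus, wall_time_s):
    return steps * per_gpu_batch * num_gpus / wall_time_s

print(approx_throughput(25000, 12, 1, 17854.76))  # ~16.8 vs. reported 17.00
print(approx_throughput(25000, 12, 8, 20312.57))  # ~118.2 vs. reported 118.45
```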
### License

This project is licensed under the Apache 2.0 license.

Parts of this project are based on the MLCommons implementation at https://github.com/mlcommons/training_results_v1.0/tree/master/NVIDIA.
@@ -0,0 +1,25 @@

from config_common import *
from torch.cuda.amp import GradScaler
import os

grad_scaler = GradScaler(init_scale=float(os.getenv("INIT_LOSS_SCALE", 2**20)),
                         growth_interval=2000)

fp16 = True
ddp_type = "apex"
dist_backend = "nccl"

# train_batch_size = 56 if get_gpu_mem() > 75 else 27
train_batch_size = 12
max_steps = 1000000

fused_gelu_bias = True
fused_mha = True
unpad = True
unpad_fmha = False
dense_seq_output = True
exchange_padding = True

dwu_num_rs_pg = 1
dwu_num_ar_pg = 1
dwu_num_blocks = 1
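For context, here is a minimal sketch of how a GradScaler like the one configured above is typically driven in a torch.cuda.amp training step. The model, optimizer, and data are placeholders, not the benchmark's actual training loop, and a CUDA device is assumed.

```python
import torch
from torch.cuda.amp import GradScaler, autocast

model = torch.nn.Linear(16, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = GradScaler(init_scale=2**20, growth_interval=2000)

for _ in range(10):
    inputs = torch.randn(12, 16, device="cuda")
    targets = torch.randint(0, 2, (12,), device="cuda")
    optimizer.zero_grad(set_to_none=True)
    with autocast():
        loss = torch.nn.functional.cross_entropy(model(inputs), targets)
    scaler.scale(loss).backward()   # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)          # unscale grads, skip the step on inf/nan
    scaler.update()                 # adjust the loss scale for the next iteration
```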
@@ -0,0 +1,19 @@

from config_Ampere_common import *

gradient_accumulation_steps = 1
start_warmup_step = 0
warmup_proportion = 0
warmup_steps = 0

distributed_lamb = False
exchange_padding = False
learning_rate = 3.5e-4
weight_decay_rate = 0.01
opt_lamb_beta_1 = 0.9
opt_lamb_beta_2 = 0.999

eval_batch_size = train_batch_size
max_samples_termination = 4500000
cache_eval_data = True

seed = 9031
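As a hedged illustration of how the LAMB hyperparameters above (learning_rate, opt_lamb_beta_1/2, weight_decay_rate) might map onto an optimizer constructor, here is a sketch using apex's FusedLAMB. The benchmark's real optimizer wiring lives elsewhere in the repository and may differ; apex must be installed for this to run.

```python
# Illustration only: mapping the config values onto apex's FusedLAMB.
import torch
from apex.optimizers import FusedLAMB

model = torch.nn.Linear(1024, 1024).cuda()
optimizer = FusedLAMB(model.parameters(),
                      lr=3.5e-4,            # learning_rate
                      betas=(0.9, 0.999),   # opt_lamb_beta_1, opt_lamb_beta_2
                      weight_decay=0.01)    # weight_decay_rate
```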
@@ -0,0 +1,18 @@

from config_Ampere_common import *

gradient_accumulation_steps = 1
start_warmup_step = 0
warmup_proportion = 0
warmup_steps = 0

distributed_lamb = False
learning_rate = 3.5e-4
weight_decay_rate = 0.01
opt_lamb_beta_1 = 0.9
opt_lamb_beta_2 = 0.999

eval_batch_size = train_batch_size
max_samples_termination = 4500000
cache_eval_data = True

seed = 9031
@@ -0,0 +1,18 @@

from config_Ampere_common import *

gradient_accumulation_steps = 1
start_warmup_step = 0
warmup_proportion = 0
warmup_steps = 0

distributed_lamb = False
learning_rate = 3.5e-4
weight_decay_rate = 0.01
opt_lamb_beta_1 = 0.9
opt_lamb_beta_2 = 0.999

eval_batch_size = train_batch_size
max_samples_termination = 4500000
cache_eval_data = True

seed = 9031
@@ -0,0 +1,18 @@

from config_Ampere_common import *

gradient_accumulation_steps = 1
start_warmup_step = 0
warmup_proportion = 0
warmup_steps = 0

distributed_lamb = False
learning_rate = 3.5e-4
weight_decay_rate = 0.01
opt_lamb_beta_1 = 0.9
opt_lamb_beta_2 = 0.999

eval_batch_size = train_batch_size
max_samples_termination = 4500000
cache_eval_data = True

seed = 9031
@@ -0,0 +1,83 @@

import torch


# 'segmented' or 'full_iteration' options for CUDA graph capture.
# 'segmented' option: PyTorch Autograd orchestrates execution of backward ops every iteration.
# 'full_iteration' option: the CUDA graph orchestrates execution of backward ops every iteration
# without Autograd involvement (has composability limitations but can be more performant,
# allowing optimizer and collectives capture).
cuda_graph_mode: str = "segmented"

# Maximum number of iterations to capture in a single graph.
# Requires the 'full_iteration' option for '--cuda_graph_mode'.
max_iterations_per_graph: int = 4

# Whether to do allreduces during gradient accumulation steps.
allreduce_post_accumulation: bool = False

# Whether to do fp16 allreduce post accumulation.
allreduce_post_accumulation_fp16: bool = False

# Whether to run with unpadding.
unpad: bool = False

# Whether to run with unpadded fused multi-head attention (FMHA).
unpad_fmha: bool = False

# Whether to pad tokens.
pad: bool = False

# Whether to disable fusion of the scaling into BMM1.
disable_fuse_scale: bool = False

# Whether to disable fusion of the QKV GEMMs.
disable_fuse_qkv: bool = False

# Whether to disable the apex softmax.
disable_apex_softmax: bool = False

# Enable use of streams for the padded case.
enable_stream: bool = False

# Whether to run with the fused multi-head attention optimization.
fused_mha: bool = False

# Enable CUDA graph execution.
use_cuda_graph: bool = False

# DDP type: 'apex' or 'native'.
ddp_type: str = "apex"

# Bypass AMP unscaling and inf/nan checks for SOL measurements.
bypass_amp: bool = False

# Whether to use distributed LAMB.
distributed_lamb: bool = False

# Distributed weight update (DWU) group size. If 0, defaults to one node.
dwu_group_size: int = 0

# Number of blocks in the DWU scheme.
dwu_num_blocks: int = 4

# Number of chunks in the DWU scheme.
dwu_num_chunks: int = 1

# Number of reduce-scatter streams in the DWU scheme.
dwu_num_rs_pg: int = 2

# Number of all-reduce streams in the DWU scheme.
dwu_num_ar_pg: int = 4

# Number of all-gather streams in the DWU scheme.
dwu_num_ag_pg: int = 2

# Whether to overlap reductions with backprop.
dwu_overlap_reductions: bool = False

# Do the allgather with e5m2 floats.
dwu_e5m2_allgather: bool = False

# The apex optimization level; value: [O1, O2].
opt_level: str = "O2"


def get_gpu_mem():
    return torch.cuda.get_device_properties("cuda:0").total_memory / 1e+9
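For readers unfamiliar with the `cuda_graph_mode` and `use_cuda_graph` flags above, here is a minimal sketch of CUDA graph capture and replay using PyTorch's public API (available in recent PyTorch, including 1.10). It illustrates the mechanism only; it is not the benchmark's capture code, and the model and shapes are placeholders.

```python
import torch

model = torch.nn.Linear(128, 128).cuda()
static_input = torch.randn(12, 128, device="cuda")

# Warm up on a side stream before capture, as the API requires.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        model(static_input)
torch.cuda.current_stream().wait_stream(s)

# Capture one forward pass into a graph, then replay it on updated data.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_output = model(static_input)

static_input.copy_(torch.randn(12, 128, device="cuda"))
graph.replay()  # reruns the captured kernels against the updated static_input
print(static_output.shape)
```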
@@ -0,0 +1,4 @@

from .attention import *
from .fmha import *
from .mha import *
from .softmax import *
Review comment: Model-structure code should not appear under a config directory; the layers/attention.py path is in the wrong location.

@@ -0,0 +1,81 @@

from torch import nn
import apex
from apex.contrib.multihead_attn import SelfMultiheadAttn
from model.models.modeling import jit_dropout_add, BertSelfOutput
from .fmha import FMHA
from .mha import FastUnpadBertSelfAttention

# apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm')
import apex.normalization
# apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward')
from apex.contrib.layer_norm import FastLayerNorm as BertLayerNorm


# This module uses the Apex C++ multihead attention implementation with fusions.
class FastBertAttention(nn.Module):

    def __init__(self, config):
        super(FastBertAttention, self).__init__()
        self.multi_head_attention = SelfMultiheadAttn(
            config.hidden_size,
            config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            bias=True,
            include_norm_add=False,
            impl='fast',
            separate_qkv_params=True,
            mask_additive=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.p = config.hidden_dropout_prob
        self.fused_dropout_add = config.fused_dropout_add
        self.layer_norm = BertLayerNorm(config.hidden_size, eps=1e-12)

    def forward(self, input_tensor, attention_mask, *args, **kwargs):
        residual = input_tensor
        multi_head_attention_output, _ = self.multi_head_attention(
            query=input_tensor,
            key=input_tensor,
            value=input_tensor,
            key_padding_mask=attention_mask,
            need_weights=True,
            attn_mask=None,
            is_training=self.training)
        if self.fused_dropout_add:
            attention_output = jit_dropout_add(multi_head_attention_output,
                                               residual, self.p, self.training)
            attention_output = self.layer_norm(attention_output)
            return attention_output
        else:
            attention_output = self.dropout(multi_head_attention_output)
            attention_output = self.layer_norm(attention_output + residual)
            return attention_output


class FastUnpadBertAttention(nn.Module):

    def __init__(self, config):
        super(FastUnpadBertAttention, self).__init__()
        if config.unpad_fmha:
            self.self = FMHA(config)
        else:
            self.self = FastUnpadBertSelfAttention(
                config,
                enable_stream=config.enable_stream,
                enable_sync=False,
                fuse_mask=config.fuse_mask,
                fuse_scale=config.fuse_scale,
                fuse_qkv=config.fuse_qkv,
                fuse_dropout=config.fuse_dropout,
                apex_softmax=config.apex_softmax,
                pad=config.pad)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask, seqlen, batch):
        self_output = self.self(input_tensor,
                                attention_mask,
                                seqlen,
                                batch,
                                is_training=self.training)
        attention_output = self.output(self_output, input_tensor)
        return attention_output
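A hedged usage sketch for the module above, showing only construction from a minimal ad-hoc config. It is not part of the PR: the import path follows the layers/attention.py location mentioned in the review comment, the config values are illustrative, and apex must be built with its contrib extensions for this to work.

```python
# Illustration only: the SimpleNamespace fields are exactly the attributes
# read in FastBertAttention.__init__ above; the values are assumptions.
from types import SimpleNamespace

from layers.attention import FastBertAttention  # path per the review comment above

config = SimpleNamespace(
    hidden_size=1024,                  # e.g. BERT-large hidden size
    num_attention_heads=16,
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
    fused_dropout_add=False,           # fall back to nn.Dropout + residual add
)

attn = FastBertAttention(config).cuda().half()
# The forward pass takes (hidden_states, attention_mask); the tensor layout and
# additive-mask convention follow apex's SelfMultiheadAttn, so consult the apex
# documentation before wiring real inputs.
```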
Review comment: Please restore this file, or fix the trailing blank lines.