fix qwen&baichuan&gpt ci error (#9650)
* fix qwen&baichuan&gpt ci error

* fix qwen&baichuan&gpt ci error

* fix qwen&baichuan&gpt ci error

* fix qwen&baichuan&gpt ci error

* fix qwen&baichuan&gpt ci error

* add gpt_dy ce

* revert qwen&baichuan dygraph ce

* revert qwen&baichuan dygraph ce

---------

Co-authored-by: xuexixi <xuexixi@baidu.com>
blacksheep-Aristotle and waliwali777 authored Dec 19, 2024
1 parent 49d762f commit 25415fb
Showing 17 changed files with 299 additions and 26 deletions.
1 change: 1 addition & 0 deletions llm/auto_parallel/qwen/run_pretrain_3D_auto.py
@@ -356,6 +356,7 @@ def get_train_data_file(args):
class PretrainingTrainer(AutoTrainer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.is_pretraining = True

def _wrap_for_dist_loader(self, train_dataloader):
dist_loader = super()._wrap_for_dist_loader(train_dataloader)
1 change: 1 addition & 0 deletions paddlenlp/transformers/gpt/configuration.py
@@ -86,6 +86,7 @@
"eol_token_id": 198,
},
"gpt3-13B-en": { # 13B
"architectures": ["GPTForCausalLM"],
"vocab_size": 50304,
"hidden_size": 5120,
"num_hidden_layers": 40,
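For context, the "architectures" field is how a preset config advertises the model class it should be loaded with, and gpt3-13B-en was the preset missing it. A minimal sketch of dispatching on that field; the registry below is hypothetical and only illustrates the pattern, it is not PaddleNLP's real auto-mapping:

class GPTForCausalLM:  # stand-in for the real PaddleNLP class
    def __init__(self, config):
        self.config = config

MODEL_REGISTRY = {"GPTForCausalLM": GPTForCausalLM}  # hypothetical registry

def load_from_config(config: dict):
    # pick the class named in "architectures", falling back to a default if absent
    arch = (config.get("architectures") or ["GPTForCausalLM"])[0]
    return MODEL_REGISTRY[arch](config)

model = load_from_config({"architectures": ["GPTForCausalLM"], "hidden_size": 5120})
print(type(model).__name__)  # GPTForCausalLM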
62 changes: 45 additions & 17 deletions paddlenlp/transformers/qwen/modeling_3D_auto.py
@@ -11,8 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import warnings
from functools import partial
from typing import List
@@ -84,6 +84,22 @@ def get_triangle_upper_mask(x, mask=None):
return mask


def enable_fuse_ffn_qkv_pass():
if os.getenv("FLAGS_enable_fused_ffn_qkv_pass") in [
"True",
"true",
"1",
]:
return True
else:
return False


def get_use_casual_mask():
"""Get the value of the 'USE_CASUAL_MASK' environment variable."""
return os.getenv("USE_CASUAL_MASK", "False") == "True"


attention_cnt = 0
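Both new helpers are driven purely by environment variables, so the fused FFN/QKV pass and the causal-mask shortcut can be toggled without touching the model code. A small self-contained sketch of how they respond to different settings (the helpers are restated from the hunk above):

import os

def enable_fuse_ffn_qkv_pass():
    # mirrors the helper above: only "True", "true" or "1" enables the pass
    return os.getenv("FLAGS_enable_fused_ffn_qkv_pass") in ["True", "true", "1"]

def get_use_casual_mask():
    # mirrors the helper above: only the exact string "True" enables it
    return os.getenv("USE_CASUAL_MASK", "False") == "True"

os.environ["FLAGS_enable_fused_ffn_qkv_pass"] = "1"
os.environ["USE_CASUAL_MASK"] = "true"  # note: not "True", so the shortcut stays off
assert enable_fuse_ffn_qkv_pass() is True
assert get_use_casual_mask() is False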


@@ -304,12 +320,20 @@ def __init__(self, config, ipp=None):
super().__init__()
ff_dim_in = config.intermediate_size // 2
self.fuse_attention_ffn = config.fuse_attention_ffn
self.w1 = nn.Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias)
self.w2 = nn.Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias)
self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias_attr=not config.no_bias)
self.ipp = ipp
self.w1.weight = dist.shard_tensor(self.w1.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)])
self.w2.weight = dist.shard_tensor(self.w2.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)])
if config.fuse_attention_ffn and not enable_fuse_ffn_qkv_pass():
self.gate_up_fused_proj = nn.Linear(config.hidden_size, ff_dim_in * 2, bias_attr=not config.no_bias)
self.gate_up_fused_proj.weight = dist.shard_tensor(
self.gate_up_fused_proj.weight,
get_mesh(self.ipp),
[dist.Replicate(), dist.Shard(1)],
)
else:
self.w1 = nn.Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias)
self.w2 = nn.Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias)
self.w1.weight = dist.shard_tensor(self.w1.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)])
self.w2.weight = dist.shard_tensor(self.w2.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)])
self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias_attr=not config.no_bias)
self.c_proj.weight = dist.shard_tensor(
self.c_proj.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(0)]
)
@@ -321,7 +345,7 @@ def forward(self, hidden_states):
# a2 = self.w2(hidden_states)
# intermediate_parallel = a1 * F.silu(a2)
# down
if self.fuse_attention_ffn:
if self.fuse_attention_ffn and not enable_fuse_ffn_qkv_pass():
intermediate_parallel = swiglu(self.gate_up_fused_proj(hidden_states))
else:
intermediate_parallel = swiglu(self.w2(hidden_states), self.w1(hidden_states))
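The fused branch packs the w1 and w2 projections into a single gate_up_fused_proj and lets swiglu split the result, so both branches compute the same SwiGLU feed-forward. A self-contained NumPy check of that equivalence; the swiglu convention follows the commented-out reference lines in the hunk (a1 * silu(a2)), and the packing order of the fused weight is an assumption rather than something shown in this diff:

import numpy as np

def silu(x):
    return x / (1.0 + np.exp(-x))

def swiglu_pair(a2, a1):
    # matches the commented-out reference: intermediate = a1 * silu(a2)
    return a1 * silu(a2)

def swiglu_fused(packed):
    # assumed packing: the fused projection holds [w2 | w1] along the output dim
    a2, a1 = np.split(packed, 2, axis=-1)
    return a1 * silu(a2)

hidden, ff = 8, 4
rng = np.random.default_rng(0)
x = rng.normal(size=(2, hidden))
w1 = rng.normal(size=(hidden, ff))
w2 = rng.normal(size=(hidden, ff))
fused = np.concatenate([w2, w1], axis=-1)  # plays the role of gate_up_fused_proj's weight

np.testing.assert_allclose(swiglu_pair(x @ w2, x @ w1), swiglu_fused(x @ fused))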
@@ -652,16 +676,20 @@ def forward(

hidden_states = inputs_embeds

# bool 4D mask
attention_mask = self.get_masks(
input_shape[0], input_shape[1], past_length, dtype=hidden_states.dtype, padding_mask=attention_mask
)
# TODO(GhostScreaming): how to fix paddle.finfo?
zero = paddle.zeros(attention_mask.shape, dtype=paddle.bfloat16)
neg_inf = paddle.full_like(attention_mask, paddle.finfo(paddle.bfloat16).min, dtype=paddle.bfloat16)
# dtype 4D mask
attention_mask = paddle.where(attention_mask, zero, neg_inf)
attention_mask = dist.shard_tensor(attention_mask, get_mesh(), [dist.Replicate(), dist.Replicate()])
use_casual_mask = get_use_casual_mask()
if use_casual_mask:
attention_mask = None
else:
# bool 4D mask
attention_mask = self.get_masks(
input_shape[0], input_shape[1], past_length, dtype=hidden_states.dtype, padding_mask=attention_mask
)
# TODO(GhostScreaming): how to fix paddle.finfo?
zero = paddle.zeros(attention_mask.shape, dtype=paddle.bfloat16)
neg_inf = paddle.full_like(attention_mask, paddle.finfo(paddle.bfloat16).min, dtype=paddle.bfloat16)
# dtype 4D mask
attention_mask = paddle.where(attention_mask, zero, neg_inf)
attention_mask = dist.shard_tensor(attention_mask, get_mesh(), [dist.Replicate(), dist.Replicate()])
hidden_states = self.drop(hidden_states)
hidden_states = dist.reshard(hidden_states, get_mesh(), [dist.Shard(0), dist.Replicate()])
output_shape = input_shape + [
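When USE_CASUAL_MASK is unset, the boolean 4D mask is turned into an additive bfloat16 mask: zero where attention is allowed and the dtype minimum where it is blocked, which is what the paddle.where call above produces before the mask is sharded. A tiny NumPy sketch of the same transformation, with float32 standing in for bfloat16 and a plain causal lower triangle as a simple stand-in for get_masks:

import numpy as np

seq_len = 4
# boolean mask: True = may attend (lower triangle, i.e. causal)
bool_mask = np.tril(np.ones((1, 1, seq_len, seq_len), dtype=bool))

neg_inf = np.finfo(np.float32).min  # analogue of paddle.finfo(paddle.bfloat16).min
additive_mask = np.where(bool_mask, 0.0, neg_inf).astype(np.float32)

print(additive_mask[0, 0])
# row i is 0.0 up to column i and a very large negative value afterwards,
# so softmax(scores + additive_mask) suppresses future positions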
@@ -9,7 +9,7 @@
"tensor_parallel_degree": 4,
"pipeline_parallel_degree": 1,
"virtual_pp_degree": 1,
"sequence_parallel": 1,
"sequence_parallel": 1,
"sharding_parallel_degree": 8,
"sharding": "stage1",
"pipeline_parallel_config": "enable_sharding_comm_overlap enable_release_grads ",
@@ -0,0 +1,25 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

param="model_item=gpt3-13b_pretrain "
param+="run_mode=DP1_MP2_PP4_VPP5_Sharding4_Stage1 "
param+="device_num=N4C32 "
param+="global_batch_size=128 "
param+="nnodes=4 "
param+="model_type=gpt3_13b "

cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh"
@@ -0,0 +1,45 @@
{
"model_name_or_path": "gpt3-13B-en",
"tokenizer_name_or_path": "gpt3-13B-en",
"input_dir": "./data",
"output_dir": "./output/gpt3-13b_pretrain_ckpts",
"split": "949,50,1",
"max_seq_length": 4096,
"gradient_accumulation_steps": 32,
"tensor_parallel_degree": 2,
"pipeline_parallel_degree": 4,
"virtual_pp_degree": 5,
"sequence_parallel": 0,
"sharding": "stage1",
"pipeline_parallel_config": "enable_sharding_comm_overlap enable_release_grads ",
"tensor_parallel_config": "enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
"per_device_train_batch_size": 1,
"use_flash_attention": true,
"use_fused_rms_norm": true,
"fuse_attention_qkv": true,
"use_fused_rope": true,
"fuse_attention_ffn": true,
"enable_linear_fused_grad_add": true,
"bf16": true,
"fp16_opt_level": "O2",
"scale_loss": 1024,
"learning_rate": 1e-05,
"min_learning_rate": 5e-06,
"max_steps": 200,
"save_steps": 5000,
"weight_decay": 0.01,
"warmup_ratio": 0.01,
"max_grad_norm": 1.0,
"logging_steps": 2,
"dataloader_num_workers": 1,
"eval_steps": 1000,
"disable_tqdm": true,
"continue_training": 0,
"recompute": false,
"recompute_granularity": "full_attn",
"do_train": true,
"pp_recompute_interval": 1,
"device": "gpu",
"amp_master_grad": true,
"sharding_parallel_config": "split_param enable_stage1_overlap enable_stage1_allgather_overlap"
}
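As a sanity check, the degrees in this config are consistent with the N4C32 launcher above: 32 cards divided by tensor_parallel_degree 2 and pipeline_parallel_degree 4 leaves 4-way sharding, matching run_mode DP1_MP2_PP4_VPP5_Sharding4_Stage1, and the effective global batch size works out to 128. A short arithmetic sketch (plain bookkeeping, no PaddleNLP APIs):

total_gpus = 4 * 8                        # N4C32: 4 nodes x 8 cards each
tp, pp = 2, 4                             # from the config above
sharding_degree = total_gpus // (tp * pp) # -> 4, matching ...Sharding4_Stage1

per_device_bs = 1                         # per_device_train_batch_size
grad_accum = 32                           # gradient_accumulation_steps
global_batch_size = per_device_bs * grad_accum * sharding_degree
assert global_batch_size == 128           # matches global_batch_size=128 in the launcher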
@@ -0,0 +1,37 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

python -m pip install -r ../requirements.txt
python -m pip install -r ../requirements-dev.txt

# install fused_ln custom ops
cd ../slm/model_zoo/gpt-3/external_ops/
python setup.py install
cd -

python -m pip install tiktoken

# prepare the data directory under llm/
cd ../llm/
rm -rf data
mkdir data
cd data
# download data
wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz
cd -

# refresh the auto_config directories
rm -rf auto_config_*
cp -r ../tests/test_tipc/dygraph/hybrid_parallelism/gpt3/auto_config_* ./
@@ -0,0 +1,134 @@
#!/usr/bin/env bash

# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${sharding} ${recompute} ${run_mode} ${device_num}
function _set_params(){
model_item=${model_item:-"gpt3-13b_pretrain"}
run_mode=${run_mode:-"MP2-PP1"}
device_num=${device_num:-"N1C8"}
global_batch_size=${global_batch_size:-64}
fp_item="bf16"
MODEL_TYPE=${model_type:-"gpt3_13b"}

ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
master_ip=${ip_lists[0]}
nnodes=${nnodes:-1}

base_batch_size=${global_batch_size}
profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in through a global variable
model_repo="PaddleNLP" # (required) name of the model suite
speed_unit="tokens/s" # (required) unit of the speed metric
skip_steps=10 # (required) number of initial unstable steps to skip when parsing the log
keyword="interval_tokens_per_second_per_device:" # (required) keyword that marks the log lines carrying performance data
convergence_key="loss:" # (optional) keyword that marks the log lines carrying convergence data, e.g. convergence_key="loss:"
model_mode=5 # collect the ips value and its unit; average only the steps after skip_steps and keep tokens/s as the unit

# The settings below are generic; usually no changes are needed
model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it must stay aligned with the competing-framework naming
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
mkdir -p $(dirname ${train_log_file})

profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
mkdir -p $(dirname ${profiling_log_file})

speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
mkdir -p $(dirname ${speed_log_file})

OUTPUT_PATH=${run_log_path}/output
}

function _train(){
batch_size=${per_device_train_batch_size} # when the model runs multiple cards in a single process, compute the multi-card batch size here in _train

if [ -d $OUTPUT_PATH ]; then
rm -rf $OUTPUT_PATH
fi
mkdir $OUTPUT_PATH

echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

if [ ${profiling} == "true" ];then
add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
log_file=${profiling_log_file}
else
add_options=""
log_file=${train_log_file}
fi

if [ ${PADDLE_TRAINER_ID} ]; then
PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
else
PADDLE_RANK_OPTION=""
fi

distributed_args="--master $master_ip:36677 --nnodes $nnodes ${PADDLE_RANK_OPTION} --run_mode=collective"

echo "==========System Env============="
env
echo "================================="

# The launch commands below are generic; usually no changes are needed
case ${device_num} in
N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
--nnodes 1 --nproc_per_node 8 \
--log_dir mylog run_pretrain.py \
./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-config.json"
;;
N4C32) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
${distributed_args} --log_dir mylog run_pretrain.py \
./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-config.json"
;;
*) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
${distributed_args} --log_dir mylog run_pretrain.py \
./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-config.json"
;;
esac
cd ../llm
rm -rf mylog && rm -rf checkpoints

echo "train_cmd: ${train_cmd} log_file: ${log_file}"
timeout 40m ${train_cmd} > ${log_file} 2>&1

if [ $? -ne 0 ];then
echo -e "${model_name}, FAIL"
else
echo -e "${model_name}, SUCCESS"
fi

#kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
if [ ${device_num} != "N1C1" -a -d mylog ]; then
case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog
cp -r ${case_path}/mylog/workerlog.* ./mylog/
fi
}

export FLAGS_selected_gpus="0,1,2,3,4,5,6,7"
export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH

source ${BENCHMARK_ROOT}/scripts/run_model.sh # run_model.sh parses performance data from benchmark-format logs via analysis.py; comment this line out if you only want the raw training log, but re-enable it before submitting
_set_params $@
#_train # uncomment to produce only the training log, without parsing
_run # defined in run_model.sh; it calls _train internally. Comment this line out if you only want the raw training log, but re-enable it before submitting
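Downstream, the benchmark framework's analysis.py extracts throughput from the training log using the keyword, skip_steps and speed_unit values set in _set_params. A rough, hypothetical sketch of that kind of parsing (not the real analysis.py):

import re

def mean_ips(log_text, keyword="interval_tokens_per_second_per_device:", skip_steps=10):
    """Average per-device token throughput, skipping the first unstable steps."""
    values = [
        float(m.group(1))
        for m in re.finditer(re.escape(keyword) + r"\s*([0-9.]+)", log_text)
    ]
    values = values[skip_steps:]
    return sum(values) / len(values) if values else 0.0

sample = "\n".join(
    f"step {i} interval_tokens_per_second_per_device: {1000 + i}" for i in range(20)
)
print(mean_ips(sample), "tokens/s")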
@@ -15,7 +15,7 @@
param="model_item=baichuan-inc-baichaun-2-13b_pretrain "
param+="run_mode=DP1_MP2_PP4_1F1B_Sharding8_Stage2 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="global_batch_size=128 "
param+="nnodes=4 "
param+="model_type=baichuan2_13b "

@@ -27,6 +27,7 @@ python -m pip install fast_dataindex
# download data
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
rm -rf data
mkdir data
mv llama_openwebtext_100k_ids.npy ./data
mv llama_openwebtext_100k_idx.npz ./data
@@ -15,7 +15,7 @@
param="model_item=gpt3-13b_pretrain_dy2st "
param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="global_batch_size=128 "
param+="nnodes=4 "
param+="model_type=gpt3_13b "

