merge develop
li126com committed Jul 16, 2024
2 parents e04712e + aa3e9c4 commit b930e6b
Showing 153 changed files with 12,938 additions and 6,448 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/monthly_test.yaml
@@ -23,7 +23,7 @@ jobs:
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 8 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_msp" ./tests/test_training/test_norm_weight.py
srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_msp" ./tests/test_training/test_norm_weight.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
@@ -42,7 +42,7 @@ jobs:
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 8 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_fsp" ./tests/test_training/test_norm_weight.py
srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_fsp" ./tests/test_training/test_norm_weight.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
@@ -61,7 +61,7 @@ jobs:
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 8 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_isp" ./tests/test_training/test_norm_weight.py
srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_isp" ./tests/test_training/test_norm_weight.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
24 changes: 19 additions & 5 deletions .github/workflows/upload_to_pypi.yaml
@@ -27,17 +27,31 @@ jobs:
run: |
pip install setuptools wheel twine
- name: get latest tag
run: |
latest_tag=$(git describe --tags --abbrev=0)
echo "$latest_tag" > version.txt
- name: build and upload package
run: |
source activate ${evo_env_torch21_flash2}
python_path=$(which python) && echo "Python executable is at: $python_path"
latest_tag=$(git describe --tags --abbrev=0)
echo "$latest_tag" > version.txt
export PYTHONPATH=$PWD:$PYTHONPATH
export LLMPLATFORM=/mnt/petrelfs/share_data/llm_env
export CUDA_PATH=${LLMPLATFORM}/dep/cuda-11.8
export GCC_HOME=${LLMPLATFORM}/dep/gcc-10.2.0
export MPFR_HOME=${LLMPLATFORM}/dep/mpfr-4.1.0
export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${CUDA_PATH}/extras/CUPTI/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${GCC_HOME}/lib64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
export CC=${GCC_HOME}/bin/gcc
export CXX=${GCC_HOME}/bin/c++
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
cd csrc/rotary/
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
cd ../xentropy/
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
exit_code=$?
cd ../../
twine upload -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} dist/*
twine upload -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} csrc/rotary/dist/*
twine upload -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} csrc/xentropy/dist/*
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
4 changes: 4 additions & 0 deletions .github/workflows/weekly_test.yaml
@@ -177,6 +177,8 @@ jobs:
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU_ISP
run: |
@@ -195,6 +197,8 @@
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU_ISP_CKPT
run: |
8 changes: 8 additions & 0 deletions configs/7B_MoE4_sft.py
@@ -146,6 +146,14 @@
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
# Whether the odd- and even-numbered columns of the query and key weights are interleaved.
# If True, the odd and even columns keep their natural interleaved order; if False,
# the model has pre-concatenated all odd columns in front and all even columns behind,
# to improve the computational efficiency of RoPE.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
num_experts=4,
moe_use_residual=False,
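To make the two orderings concrete, here is a minimal PyTorch sketch (our illustration, not code from this repository; the helper names are hypothetical) that converts a query/key weight between the interleaved and the concatenated layout described above:

import torch

def deinterleave_qk(w: torch.Tensor) -> torch.Tensor:
    # Interleaved layout [q1,q2,q3,q4,...] -> concatenated layout [q1,q3,...,q2,q4,...].
    odd = w[..., 0::2]   # odd-numbered columns q1, q3, q5, ...
    even = w[..., 1::2]  # even-numbered columns q2, q4, q6, ...
    return torch.cat([odd, even], dim=-1)

def interleave_qk(w: torch.Tensor) -> torch.Tensor:
    # Inverse: restore the natural interleaved order.
    half = w.shape[-1] // 2
    out = torch.empty_like(w)
    out[..., 0::2] = w[..., :half]
    out[..., 1::2] = w[..., half:]
    return out

w = torch.arange(8.0).reshape(1, 8)  # stands in for columns q1..q8
assert torch.equal(interleave_qk(deinterleave_qk(w)), w)

A checkpoint trained in one layout has to be permuted with a transform like this before it can be loaded under the opposite qk_interleaved setting.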
25 changes: 24 additions & 1 deletion configs/7B_internlm2.py
@@ -1,5 +1,5 @@
JOB_NAME = "7b_internlm2_train"
model_type="INTERNLM2_PUBLIC"
model_type = "INTERNLM2_PUBLIC"
DO_ALERT = False

VOCAB_SIZE = 92544
@@ -144,6 +144,14 @@
layer_norm_epsilon=1e-5,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
use_flash_attn=True,
# Whether the odd- and even-numbered columns of the query and key weights are interleaved.
# If True, the odd and even columns keep their natural interleaved order; if False,
# the model has pre-concatenated all odd columns in front and all even columns behind,
# to improve the computational efficiency of RoPE.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
)

"""
@@ -197,3 +205,18 @@
# metric_dtype can be "fp32" or another string
# metrics are calculated in fp32 only when it is set to "fp32"
# metric_dtype = "fp32"

generation = dict(
ckpt_folder="/path/to/saved/ckpt",
output_folder="/path/to/save/generation",
batch_size=1,
eos_id=[2, 0],
bos_id=1,
max_length=100,
do_sample=True,
temperature=1.0,
top_k=50,
top_p=1.0,
repetition_penalty=1,
length_penalty=1.0,
)
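As background for the sampling fields above, this is the standard way temperature, top_k, and top_p combine during decoding; a generic sketch of nucleus sampling, not this repository's generation code:

import torch

def sample_next_token(logits: torch.Tensor, temperature=1.0, top_k=50, top_p=1.0) -> int:
    # logits: [vocab_size] scores for the next token.
    logits = logits / max(temperature, 1e-6)        # temperature scaling
    if top_k > 0:                                   # top-k: keep the k largest logits
        kth = torch.topk(logits, top_k).values[-1]
        logits = logits.masked_fill(logits < kth, float("-inf"))
    if top_p < 1.0:                                 # top-p: keep the smallest nucleus
        sorted_logits, idx = torch.sort(logits, descending=True)
        probs = torch.softmax(sorted_logits, dim=-1)
        cum = probs.cumsum(dim=-1)
        sorted_logits[cum - probs > top_p] = float("-inf")
        logits = torch.full_like(logits, float("-inf")).scatter(0, idx, sorted_logits)
    return torch.multinomial(torch.softmax(logits, dim=-1), 1).item()

With the defaults above (top_p=1.0), the nucleus filter is a no-op and only the top-k filter is active.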
8 changes: 8 additions & 0 deletions configs/7B_isp_sft.py
@@ -146,6 +146,14 @@
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
# Whether the odd- and even-numbered columns of the query and key weights are interleaved.
# If True, the odd and even columns keep their natural interleaved order; if False,
# the model has pre-concatenated all odd columns in front and all even columns behind,
# to improve the computational efficiency of RoPE.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
)
"""
zero1 parallel (dict):
12 changes: 10 additions & 2 deletions configs/7B_llama2.py
@@ -6,8 +6,8 @@
SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
NUM_KV_ATTENTION_HEAD = 8
MLP_RATIO = 3.5
NUM_KV_ATTENTION_HEAD = 32
MLP_RATIO = 2.6875
NUM_LAYER = 32


@@ -144,6 +144,14 @@
layer_norm_epsilon=1e-5,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
use_flash_attn=True,
# Whether the odd- and even-numbered columns of the query and key weights are interleaved.
# If True, the odd and even columns keep their natural interleaved order; if False,
# the model has pre-concatenated all odd columns in front and all even columns behind,
# to improve the computational efficiency of RoPE.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
)

"""
17 changes: 17 additions & 0 deletions configs/7B_sft.py
@@ -71,6 +71,10 @@
valid_folder=VALID_FOLDER,
empty_cache_and_diag_interval=200,
diag_outlier_ratio=1.1,
# whether to use shared memory to load meta files
use_shm=False,
# when use_shm is True, the default shm_path is "/dev/shm/metacache"
# shm_path="/dev/shm/metacache"
)

grad_scaler = dict(
@@ -100,6 +104,11 @@
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
# whether to use the new split-tensor optimizer
use_split_tensor_optim=False,
# when using the split-tensor optimizer, each all-gather
# covers a group of parameters of total size all_gather_size
all_gather_size=512 * 1024 * 1024,
)

loss = dict(
@@ -145,6 +154,14 @@
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
# Whether the odd- and even-numbered columns of the query and key weights are interleaved.
# If True, the odd and even columns keep their natural interleaved order; if False,
# the model has pre-concatenated all odd columns in front and all even columns behind,
# to improve the computational efficiency of RoPE.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
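The all_gather_size comment above describes a bucketing scheme: parameters are communicated in fixed-size groups instead of one collective per tensor. A rough sketch of that grouping idea, assuming simple greedy packing (our illustration, not the optimizer's actual code):

import torch

def bucket_params(params, bucket_bytes=512 * 1024 * 1024):
    # Greedily pack parameters into buckets of at most bucket_bytes each,
    # so one all-gather (or reduce) moves a whole bucket at a time.
    buckets, cur, cur_bytes = [], [], 0
    for p in params:
        nbytes = p.numel() * p.element_size()
        if cur and cur_bytes + nbytes > bucket_bytes:
            buckets.append(cur)
            cur, cur_bytes = [], 0
        cur.append(p)
        cur_bytes += nbytes
    if cur:
        buckets.append(cur)
    return buckets

model = torch.nn.Linear(1024, 1024)
print(len(bucket_params(model.parameters(), bucket_bytes=1 << 20)))  # 2 buckets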
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_1B.py
@@ -25,7 +25,7 @@
mlp_ratio=MLP_RATIO,
multiple_of=MULTIPLE_OF,
norm_type="rmsnorm",
adapt_hf=True,
qk_interleaved=False,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_20B.py
@@ -23,7 +23,7 @@
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
adapt_hf=True,
qk_interleaved=False,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_7B.py
@@ -23,7 +23,7 @@
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
adapt_hf=False,
qk_interleaved=True,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
37 changes: 37 additions & 0 deletions csrc/rotary/rotary.cpp
@@ -0,0 +1,37 @@
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>

#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA")
#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")

void apply_rotary_cuda(const torch::Tensor x1, const torch::Tensor x2,
const torch::Tensor cos, const torch::Tensor sin,
torch::Tensor out1, torch::Tensor out2,
const bool conj);

void apply_rotary(const torch::Tensor x1, const torch::Tensor x2,
const torch::Tensor cos, const torch::Tensor sin,
torch::Tensor out1, torch::Tensor out2,
const bool conj) {
CHECK_DEVICE(x1); CHECK_DEVICE(x2);
CHECK_DEVICE(cos); CHECK_DEVICE(sin);
CHECK_DEVICE(out1); CHECK_DEVICE(out2);
TORCH_CHECK(x1.dtype() == x2.dtype());
TORCH_CHECK(cos.dtype() == sin.dtype());
TORCH_CHECK(out1.dtype() == out2.dtype());
TORCH_CHECK(x1.dtype() == cos.dtype());
TORCH_CHECK(x1.dtype() == out1.dtype());
TORCH_CHECK(x1.sizes() == x2.sizes());
TORCH_CHECK(cos.sizes() == sin.sizes());
TORCH_CHECK(out1.sizes() == out2.sizes());

// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)x1.get_device()};

apply_rotary_cuda(x1, x2, cos, sin, out1, out2, conj);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("apply_rotary", &apply_rotary, "Apply rotary embedding");
}
41 changes: 41 additions & 0 deletions csrc/rotary/rotary_cuda.cu
@@ -0,0 +1,41 @@
#include <torch/python.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Loops.cuh>

void apply_rotary_cuda(const torch::Tensor x1, const torch::Tensor x2,
const torch::Tensor cos, const torch::Tensor sin,
torch::Tensor out1, torch::Tensor out2,
const bool conj) {
auto iter = at::TensorIteratorConfig()
.add_output(out1)
.add_output(out2)
.add_input(x1)
.add_input(x2)
.add_input(cos)
.add_input(sin)
.check_all_same_dtype(false)
.promote_inputs_to_common_dtype(false)
.build();

if (!conj) {
AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] {
at::native::gpu_kernel_multiple_outputs(
iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos,
scalar_t sin) -> thrust::tuple<scalar_t, scalar_t> {
scalar_t out1 = float(x1) * float(cos) - float(x2) * float(sin);
scalar_t out2 = float(x1) * float(sin) + float(x2) * float(cos);
return {out1, out2};
});
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] {
at::native::gpu_kernel_multiple_outputs(
iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos,
scalar_t sin) -> thrust::tuple<scalar_t, scalar_t> {
scalar_t out1 = float(x1) * float(cos) + float(x2) * float(sin);
scalar_t out2 = -float(x1) * float(sin) + float(x2) * float(cos);
return {out1, out2};
});
});
}
}
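For reference, the rotation this kernel fuses can be written in a few lines of PyTorch; this is our sketch of the math only (the CUDA version additionally upcasts each element to float and writes both outputs in a single fused pass):

import torch

def apply_rotary_ref(x1, x2, cos, sin, conj=False):
    # conj=False: rotate each (x1, x2) pair by +theta, where cos/sin encode theta.
    # conj=True: rotate by -theta (the inverse, as used in the backward pass).
    if not conj:
        return x1 * cos - x2 * sin, x1 * sin + x2 * cos
    return x1 * cos + x2 * sin, -x1 * sin + x2 * cos

x1, x2 = torch.randn(4, 8), torch.randn(4, 8)
theta = torch.randn(4, 8)
cos, sin = torch.cos(theta), torch.sin(theta)
y1, y2 = apply_rotary_ref(x1, x2, cos, sin)
r1, r2 = apply_rotary_ref(y1, y2, cos, sin, conj=True)
assert torch.allclose(r1, x1, atol=1e-6) and torch.allclose(r2, x2, atol=1e-6)

Applying the conjugate rotation after the forward one recovers the inputs, which is what the assertion verifies.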