merge develop
li126com committed Jul 16, 2024
2 parents e04712e + aa3e9c4 commit b930e6b
Showing 153 changed files with 12,938 additions and 6,448 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/monthly_test.yaml
@@ -23,7 +23,7 @@ jobs:
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 8 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_msp" ./tests/test_training/test_norm_weight.py
srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_msp" ./tests/test_training/test_norm_weight.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
@@ -42,7 +42,7 @@ jobs:
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 8 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_fsp" ./tests/test_training/test_norm_weight.py
srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_fsp" ./tests/test_training/test_norm_weight.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
@@ -61,7 +61,7 @@ jobs:
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 8 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_isp" ./tests/test_training/test_norm_weight.py
srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_isp" ./tests/test_training/test_norm_weight.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
24 changes: 19 additions & 5 deletions .github/workflows/upload_to_pypi.yaml
@@ -27,17 +27,31 @@ jobs:
run: |
pip install setuptools wheel twine
- name: get latest tag
run: |
latest_tag=$(git describe --tags --abbrev=0)
echo "$latest_tag" > version.txt
- name: build and upload package
run: |
source activate ${evo_env_torch21_flash2}
python_path=$(which python) && echo "Python executable is at: $python_path"
latest_tag=$(git describe --tags --abbrev=0)
echo "$latest_tag" > version.txt
export PYTHONPATH=$PWD:$PYTHONPATH
export LLMPLATFORM=/mnt/petrelfs/share_data/llm_env
export CUDA_PATH=${LLMPLATFORM}/dep/cuda-11.8
export GCC_HOME=${LLMPLATFORM}/dep/gcc-10.2.0
export MPFR_HOME=${LLMPLATFORM}/dep/mpfr-4.1.0
export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${CUDA_PATH}/extras/CUPTI/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${GCC_HOME}/lib64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
export CC=${GCC_HOME}/bin/gcc
export CXX=${GCC_HOME}/bin/c++
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
cd csrc/rotary/
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
cd ../xentropy/
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
exit_code=$?
cd ../../
twine upload -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} dist/*
twine upload -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} csrc/rotary/dist/*
twine upload -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} csrc/xentropy/dist/*
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
4 changes: 4 additions & 0 deletions .github/workflows/weekly_test.yaml
@@ -177,6 +177,8 @@ jobs:
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU_ISP
run: |
@@ -195,6 +197,8 @@
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU_ISP_CKPT
run: |
8 changes: 8 additions & 0 deletions configs/7B_MoE4_sft.py
@@ -146,6 +146,14 @@
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
# Whether the odd- and even-numbered columns of the query and key weights are interleaved.
# If True, the odd and even columns keep their natural interleaved order; if False,
# the model has pre-concatenated all odd columns in front and all even columns behind,
# to improve the computational efficiency of RoPE.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
num_experts=4,
moe_use_residual=False,
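To make the two orderings concrete, here is a minimal PyTorch sketch (our illustration, not code from this repository; the helper names are hypothetical) that converts a query/key weight between the interleaved and the concatenated layout described above:

import torch

def deinterleave_qk(w: torch.Tensor) -> torch.Tensor:
    # Interleaved layout [q1,q2,q3,q4,...] -> concatenated layout [q1,q3,...,q2,q4,...].
    odd = w[..., 0::2]   # odd-numbered columns q1, q3, q5, ...
    even = w[..., 1::2]  # even-numbered columns q2, q4, q6, ...
    return torch.cat([odd, even], dim=-1)

def interleave_qk(w: torch.Tensor) -> torch.Tensor:
    # Inverse: restore the natural interleaved order.
    half = w.shape[-1] // 2
    out = torch.empty_like(w)
    out[..., 0::2] = w[..., :half]
    out[..., 1::2] = w[..., half:]
    return out

w = torch.arange(8.0).reshape(1, 8)  # stands in for columns q1..q8
assert torch.equal(interleave_qk(deinterleave_qk(w)), w)

A checkpoint trained in one layout has to be permuted with a transform like this before it can be loaded under the opposite qk_interleaved setting.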
25 changes: 24 additions & 1 deletion configs/7B_internlm2.py
@@ -1,5 +1,5 @@
JOB_NAME = "7b_internlm2_train"
model_type="INTERNLM2_PUBLIC"
model_type = "INTERNLM2_PUBLIC"
DO_ALERT = False

VOCAB_SIZE = 92544
@@ -144,6 +144,14 @@
layer_norm_epsilon=1e-5,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
use_flash_attn=True,
# Whether the odd- and even-numbered columns of the query and key weights are interleaved.
# If True, the odd and even columns keep their natural interleaved order; if False,
# the model has pre-concatenated all odd columns in front and all even columns behind,
# to improve the computational efficiency of RoPE.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
)

"""
@@ -197,3 +205,18 @@
# metric_dtype can be "fp32" or another string
# metrics are calculated in fp32 only when it is set to "fp32"
# metric_dtype = "fp32"

generation = dict(
ckpt_folder="/path/to/saved/ckpt",
output_folder="/path/to/save/generation",
batch_size=1,
eos_id=[2, 0],
bos_id=1,
max_length=100,
do_sample=True,
temperature=1.0,
top_k=50,
top_p=1.0,
repetition_penalty=1,
length_penalty=1.0,
)
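As background for the sampling fields above, this is the standard way temperature, top_k, and top_p combine during decoding; a generic sketch of nucleus sampling, not this repository's generation code:

import torch

def sample_next_token(logits: torch.Tensor, temperature=1.0, top_k=50, top_p=1.0) -> int:
    # logits: [vocab_size] scores for the next token.
    logits = logits / max(temperature, 1e-6)        # temperature scaling
    if top_k > 0:                                   # top-k: keep the k largest logits
        kth = torch.topk(logits, top_k).values[-1]
        logits = logits.masked_fill(logits < kth, float("-inf"))
    if top_p < 1.0:                                 # top-p: keep the smallest nucleus
        sorted_logits, idx = torch.sort(logits, descending=True)
        probs = torch.softmax(sorted_logits, dim=-1)
        cum = probs.cumsum(dim=-1)
        sorted_logits[cum - probs > top_p] = float("-inf")
        logits = torch.full_like(logits, float("-inf")).scatter(0, idx, sorted_logits)
    return torch.multinomial(torch.softmax(logits, dim=-1), 1).item()

With the defaults above (top_p=1.0), the nucleus filter is a no-op and only the top-k filter is active.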
8 changes: 8 additions & 0 deletions configs/7B_isp_sft.py
@@ -146,6 +146,14 @@
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
# Whether the odd- and even-numbered columns of the query and key weights are interleaved.
# If True, the odd and even columns keep their natural interleaved order; if False,
# the model has pre-concatenated all odd columns in front and all even columns behind,
# to improve the computational efficiency of RoPE.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
)
"""
zero1 parallel (dict):
12 changes: 10 additions & 2 deletions configs/7B_llama2.py
@@ -6,8 +6,8 @@
SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
NUM_KV_ATTENTION_HEAD = 8
MLP_RATIO = 3.5
NUM_KV_ATTENTION_HEAD = 32
MLP_RATIO = 2.6875
NUM_LAYER = 32


@@ -144,6 +144,14 @@
layer_norm_epsilon=1e-5,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
use_flash_attn=True,
# Whether the odd- and even-numbered columns of the query and key weights are interleaved.
# If True, the odd and even columns keep their natural interleaved order; if False,
# the model has pre-concatenated all odd columns in front and all even columns behind,
# to improve the computational efficiency of RoPE.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
)

"""
17 changes: 17 additions & 0 deletions configs/7B_sft.py
@@ -71,6 +71,10 @@
valid_folder=VALID_FOLDER,
empty_cache_and_diag_interval=200,
diag_outlier_ratio=1.1,
# whether to use shared memory to load meta files
use_shm=False,
# when use_shm is True, the default shm_path is "/dev/shm/metacache"
# shm_path="/dev/shm/metacache"
)

grad_scaler = dict(
@@ -100,6 +104,11 @@
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
# whether to use the new split-tensor optimizer
use_split_tensor_optim=False,
# when using the split-tensor optimizer, each all-gather
# covers a group of parameters of total size all_gather_size
all_gather_size=512 * 1024 * 1024,
)

loss = dict(
@@ -145,6 +154,14 @@
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
# Whether the odd- and even-numbered columns of the query and key weights are interleaved.
# If True, the odd and even columns keep their natural interleaved order; if False,
# the model has pre-concatenated all odd columns in front and all even columns behind,
# to improve the computational efficiency of RoPE.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
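The all_gather_size comment above describes a bucketing scheme: parameters are communicated in fixed-size groups instead of one collective per tensor. A rough sketch of that grouping idea, assuming simple greedy packing (our illustration, not the optimizer's actual code):

import torch

def bucket_params(params, bucket_bytes=512 * 1024 * 1024):
    # Greedily pack parameters into buckets of at most bucket_bytes each,
    # so one all-gather (or reduce) moves a whole bucket at a time.
    buckets, cur, cur_bytes = [], [], 0
    for p in params:
        nbytes = p.numel() * p.element_size()
        if cur and cur_bytes + nbytes > bucket_bytes:
            buckets.append(cur)
            cur, cur_bytes = [], 0
        cur.append(p)
        cur_bytes += nbytes
    if cur:
        buckets.append(cur)
    return buckets

model = torch.nn.Linear(1024, 1024)
print(len(bucket_params(model.parameters(), bucket_bytes=1 << 20)))  # 2 buckets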
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_1B.py
@@ -25,7 +25,7 @@
mlp_ratio=MLP_RATIO,
multiple_of=MULTIPLE_OF,
norm_type="rmsnorm",
adapt_hf=True,
qk_interleaved=False,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_20B.py
@@ -23,7 +23,7 @@
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
adapt_hf=True,
qk_interleaved=False,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_7B.py
@@ -23,7 +23,7 @@
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
adapt_hf=False,
qk_interleaved=True,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
37 changes: 37 additions & 0 deletions csrc/rotary/rotary.cpp
@@ -0,0 +1,37 @@
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>

#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA")
#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")

void apply_rotary_cuda(const torch::Tensor x1, const torch::Tensor x2,
const torch::Tensor cos, const torch::Tensor sin,
torch::Tensor out1, torch::Tensor out2,
const bool conj);

void apply_rotary(const torch::Tensor x1, const torch::Tensor x2,
const torch::Tensor cos, const torch::Tensor sin,
torch::Tensor out1, torch::Tensor out2,
const bool conj) {
CHECK_DEVICE(x1); CHECK_DEVICE(x2);
CHECK_DEVICE(cos); CHECK_DEVICE(sin);
CHECK_DEVICE(out1); CHECK_DEVICE(out2);
TORCH_CHECK(x1.dtype() == x2.dtype());
TORCH_CHECK(cos.dtype() == sin.dtype());
TORCH_CHECK(out1.dtype() == out2.dtype());
TORCH_CHECK(x1.dtype() == cos.dtype());
TORCH_CHECK(x1.dtype() == out1.dtype());
TORCH_CHECK(x1.sizes() == x2.sizes());
TORCH_CHECK(cos.sizes() == sin.sizes());
TORCH_CHECK(out1.sizes() == out2.sizes());

// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)x1.get_device()};

apply_rotary_cuda(x1, x2, cos, sin, out1, out2, conj);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("apply_rotary", &apply_rotary, "Apply rotary embedding");
}
41 changes: 41 additions & 0 deletions csrc/rotary/rotary_cuda.cu
@@ -0,0 +1,41 @@
#include <torch/python.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Loops.cuh>

void apply_rotary_cuda(const torch::Tensor x1, const torch::Tensor x2,
const torch::Tensor cos, const torch::Tensor sin,
torch::Tensor out1, torch::Tensor out2,
const bool conj) {
auto iter = at::TensorIteratorConfig()
.add_output(out1)
.add_output(out2)
.add_input(x1)
.add_input(x2)
.add_input(cos)
.add_input(sin)
.check_all_same_dtype(false)
.promote_inputs_to_common_dtype(false)
.build();

if (!conj) {
AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] {
at::native::gpu_kernel_multiple_outputs(
iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos,
scalar_t sin) -> thrust::tuple<scalar_t, scalar_t> {
scalar_t out1 = float(x1) * float(cos) - float(x2) * float(sin);
scalar_t out2 = float(x1) * float(sin) + float(x2) * float(cos);
return {out1, out2};
});
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] {
at::native::gpu_kernel_multiple_outputs(
iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos,
scalar_t sin) -> thrust::tuple<scalar_t, scalar_t> {
scalar_t out1 = float(x1) * float(cos) + float(x2) * float(sin);
scalar_t out2 = -float(x1) * float(sin) + float(x2) * float(cos);
return {out1, out2};
});
});
}
}
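For reference, the rotation this kernel fuses can be written in a few lines of PyTorch; this is our sketch of the math only (the CUDA version additionally upcasts each element to float and writes both outputs in a single fused pass):

import torch

def apply_rotary_ref(x1, x2, cos, sin, conj=False):
    # conj=False: rotate each (x1, x2) pair by +theta, where cos/sin encode theta.
    # conj=True: rotate by -theta (the inverse, as used in the backward pass).
    if not conj:
        return x1 * cos - x2 * sin, x1 * sin + x2 * cos
    return x1 * cos + x2 * sin, -x1 * sin + x2 * cos

x1, x2 = torch.randn(4, 8), torch.randn(4, 8)
theta = torch.randn(4, 8)
cos, sin = torch.cos(theta), torch.sin(theta)
y1, y2 = apply_rotary_ref(x1, x2, cos, sin)
r1, r2 = apply_rotary_ref(y1, y2, cos, sin, conj=True)
assert torch.allclose(r1, x1, atol=1e-6) and torch.allclose(r2, x2, atol=1e-6)

Applying the conjugate rotation after the forward one recovers the inputs, which is what the assertion verifies.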