Commit
fix train ci
li126com committed Dec 4, 2024
1 parent 431b5e6 commit 8f551fb
Showing 3 changed files with 106 additions and 23 deletions.
59 changes: 49 additions & 10 deletions tests/test_training/7B_check_acc.py
@@ -1,16 +1,20 @@
 import os
 
-JOB_NAME = "7b_train"
+JOB_NAME = "7b_internlm2_train"
+model_type = "INTERNLM2_PUBLIC"
 DO_ALERT = False
 
+VOCAB_SIZE = 92544
 SEQ_LEN = 2048
 HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
+NUM_KV_ATTENTION_HEAD = 8
+MLP_RATIO = 3.5
 NUM_LAYER = 32
-VOCAB_SIZE = 103168
 
-MODEL_ONLY_FOLDER = os.path.join(os.environ["share_path"], "quailty_assurance/7B_model_weights_ckpt/init")
+MODEL_ONLY_FOLDER = os.path.join(
+    os.environ["share_path"], "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt/init"
+)
 # Ckpt folder format:
 # fs: 'local:/mnt/nfs/XXX'
 # SAVE_CKPT_FOLDER = "local:llm_ckpts_0925_9"
@@ -121,21 +125,31 @@
 )
 
 model = dict(
-    checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1]
+    checkpoint=False,
+    num_chunks=1,
     num_attention_heads=NUM_ATTENTION_HEAD,
     embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
     embed_grad_scale=1,
     parallel_output=True,
     hidden_size=HIDDEN_SIZE,
     num_layers=NUM_LAYER,
+    no_bias=True,
     mlp_ratio=MLP_RATIO,
     apply_post_layer_norm=False,
-    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    dtype="torch.bfloat16",
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,
+    num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
     use_flash_attn=True,
-    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+    # Whether the odd and even columns of the query and key in the model are normally interleaved.
+    # If it's True, the model's odd and even columns are normally ordered; if it's False,
+    # it means that the model has prematurely concatenated all odd columns and even columns in front
+    # and back, in order to improve the RoPE's computational efficiency.
+    # Example:
+    # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
+    # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
+    qk_interleaved=False,
 )
 """
 zero1 parallel:
@@ -150,9 +164,9 @@
 tensor parallel: tensor parallel size, usually the number of GPUs per node.
 """
 parallel = dict(
-    zero1=dict(size=8),
-    tensor=dict(size=1, mode="mtp"),
-    pipeline=dict(size=1, interleaved_overlap=True),
+    zero1=dict(size=-1),
+    tensor=dict(size=2, mode="mtp"),
+    pipeline=dict(size=2, interleaved_overlap=True),
     weight=dict(size=1, overlap=True),
 )
 
@@ -165,5 +179,30 @@
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None, # feishu webhook to send alert message
         light_monitor_address=None, # light_monitor address to send heartbeat
+        alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
+    tensorboard=dict(
+        queue_max_length=10,
+    ),
 )
+
+# metric_dtype can be "fp32" or other string
+# only when set to "fp32" will use fp32 to calc in metrics
+# metric_dtype = "fp32"
+
+generation = dict(
+    ckpt_folder="/path/to/saved/ckpt",
+    output_folder="/path/to/save/generation",
+    batch_size=1,
+    eos_id=[2, 0],
+    bos_id=1,
+    max_length=100,
+    do_sample=True,
+    temperature=1.0,
+    top_k=50,
+    top_p=1.0,
+    repetition_penalty=1,
+    length_penalty=1.0,
+)
+
+enable_tb = False
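The qk_interleaved comment added to the model dict above describes two layouts for the query/key projection columns. As a toy illustration of the mapping (my own example, not code from this commit; it assumes only PyTorch), the non-interleaved layout places all odd-numbered columns ahead of the even-numbered ones, which is the form a rotate-half style RoPE implementation typically expects:

import torch

head_dim = 8
q = torch.arange(1, head_dim + 1)  # interleaved order: [q1, q2, ..., q8]

# qk_interleaved=False layout: odd-numbered columns first, then even-numbered ones.
q_split = torch.cat([q[0::2], q[1::2]])  # [q1, q3, q5, q7, q2, q4, q6, q8]

# Re-zip the two halves to recover the interleaved order.
half = head_dim // 2
q_back = torch.stack([q_split[:half], q_split[half:]], dim=-1).reshape(-1)
assert torch.equal(q_back, q)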
49 changes: 43 additions & 6 deletions tests/test_training/7B_check_init.py
@@ -1,12 +1,14 @@
JOB_NAME = "7b_train"
JOB_NAME = "7b_internlm2_train"
model_type = "INTERNLM2_PUBLIC"
DO_ALERT = False

VOCAB_SIZE = 92544
SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_KV_ATTENTION_HEAD = 8
MLP_RATIO = 3.5
NUM_LAYER = 32
VOCAB_SIZE = 103168

CHECK_INIT = 1

@@ -128,21 +130,31 @@
 )
 
 model = dict(
-    checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1]
+    checkpoint=False,
+    num_chunks=1,
     num_attention_heads=NUM_ATTENTION_HEAD,
     embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
     embed_grad_scale=1,
     parallel_output=True,
     hidden_size=HIDDEN_SIZE,
     num_layers=NUM_LAYER,
+    no_bias=True,
     mlp_ratio=MLP_RATIO,
     apply_post_layer_norm=False,
-    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    dtype="torch.bfloat16",
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,
+    num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
     use_flash_attn=True,
-    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+    # Whether the odd and even columns of the query and key in the model are normally interleaved.
+    # If it's True, the model's odd and even columns are normally ordered; if it's False,
+    # it means that the model has prematurely concatenated all odd columns and even columns in front
+    # and back, in order to improve the RoPE's computational efficiency.
+    # Example:
+    # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
+    # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
+    qk_interleaved=False,
 )
 
 parallel = dict(
@@ -161,5 +173,30 @@
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None, # feishu webhook to send alert message
         light_monitor_address=None, # light_monitor address to send heartbeat
+        alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
+    tensorboard=dict(
+        queue_max_length=10,
+    ),
 )
+
+# metric_dtype can be "fp32" or other string
+# only when set to "fp32" will use fp32 to calc in metrics
+# metric_dtype = "fp32"
+
+generation = dict(
+    ckpt_folder="/path/to/saved/ckpt",
+    output_folder="/path/to/save/generation",
+    batch_size=1,
+    eos_id=[2, 0],
+    bos_id=1,
+    max_length=100,
+    do_sample=True,
+    temperature=1.0,
+    top_k=50,
+    top_p=1.0,
+    repetition_penalty=1,
+    length_penalty=1.0,
+)
+
+enable_tb = False
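For context on the parallel layout these CI configs target: 7B_check_acc.py sets tensor=2 and pipeline=2, and the reference checkpoint folders are named with dp=2, which implies a 2 x 2 x 2 = 8-rank job. A rough sketch of that arithmetic (the helper is my own, assuming the usual convention that data-parallel size is world_size / (tensor * pipeline) and that zero1=dict(size=-1) spans the full data-parallel group):

def derive_parallel_sizes(world_size: int, tensor: int, pipeline: int) -> dict:
    # Assumption: dp = world_size / (tp * pp); zero1=-1 is read as "use the whole dp group".
    assert world_size % (tensor * pipeline) == 0, "tp * pp must divide world_size"
    data = world_size // (tensor * pipeline)
    return {"data": data, "tensor": tensor, "pipeline": pipeline, "zero1": data}

print(derive_parallel_sizes(world_size=8, tensor=2, pipeline=2))
# -> {'data': 2, 'tensor': 2, 'pipeline': 2, 'zero1': 2}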
21 changes: 14 additions & 7 deletions tests/test_training/train_CI.py
@@ -20,7 +20,7 @@
 from internlm.checkpoint import CheckpointManager # noqa: E402
 from internlm.core.context import ParallelMode # noqa: E402
 from internlm.core.context import global_context as gpc # noqa: E402
-from internlm.core.trainer import TrainState, Trainer # noqa: E402
+from internlm.core.trainer import Trainer, TrainState # noqa: E402
 from internlm.data import ( # noqa: E402
     build_train_loader_with_data_type,
     build_valid_loader_with_data_type,
@@ -60,6 +60,7 @@
 
 
 def check_model_weights(model, ckpt_path, total_equal=False):
+    model = model.model
     model1_dict = torch.load(ckpt_path, map_location="cuda")
     model2_dict = model.state_dict()
 
@@ -214,13 +215,14 @@ def main(args):
     # check model init weights
     if hasattr(gpc.config, "CHECK_INIT") and gpc.config.CHECK_INIT == 1:
         ckpt_name = (
-            f"model_dp{gpc.get_local_rank(ParallelMode.DATA)}"
+            f"model"
             f"_tp{gpc.get_local_rank(ParallelMode.TENSOR)}"
             f"_pp{gpc.get_local_rank(ParallelMode.PIPELINE)}.pt"
         )
-        ckpt_path = os.path.join(os.environ["share_path"], "quailty_assurance/7B_init_dp=2_tp=2_pp=2_ckpt", ckpt_name)
+        ckpt_path = os.path.join(
+            os.environ["share_path"], "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt/init", ckpt_name
+        )
         check_model_weights(model, ckpt_path, total_equal=True)
-
     with initialize_llm_profile(profiling=args.profiling, start_time=current_time) as prof:
         # start iterating the train data and begin training
         for batch_count in range(train_state.batch_count, total_steps):
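The init-check above (and the periodic check below) now builds the reference filename from the tensor- and pipeline-parallel ranks only; the dp rank was dropped from the name, presumably because data-parallel replicas hold identical weights. Purely for illustration, with hypothetical rank values:

# Hypothetical ranks for a dp=2 / tp=2 / pp=2 job; only tp/pp select the reference file.
tp_rank, pp_rank = 1, 0
ckpt_name = f"model_tp{tp_rank}_pp{pp_rank}.pt"  # -> "model_tp1_pp0.pt"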
@@ -327,12 +329,17 @@ def main(args):
             )
 
             # check model weights
-            if gpc.is_rank_for_log() and batch_count > 0 and batch_count % 100 == 0:
+            if batch_count > 0 and batch_count % 100 == 0:
+                ckpt_name = (
+                    f"model"
+                    f"_tp{gpc.get_local_rank(ParallelMode.TENSOR)}"
+                    f"_pp{gpc.get_local_rank(ParallelMode.PIPELINE)}.pt"
+                )
                 ckpt_path = os.path.join(
                     os.environ["share_path"],
-                    "quailty_assurance/7B_model_weights_ckpt",
+                    "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt",
                     str(batch_count),
-                    "model_tp0_pp0.pt",
+                    ckpt_name,
                 )
                 check_model_weights(model, ckpt_path)
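check_model_weights now unwraps the wrapped module (model = model.model) and runs on every rank, loading the reference file that matches the rank's tp/pp coordinates. The rest of its body is not shown in this diff; the following is a hedged sketch of the kind of state-dict comparison such a check typically performs (the function name and tolerances are my own):

import torch

def compare_state_dicts(reference, current, total_equal=False):
    # reference: dict loaded from the saved checkpoint; current: model.state_dict().
    for name, ref in reference.items():
        assert name in current, f"missing parameter: {name}"
        cur = current[name].to(device=ref.device, dtype=ref.dtype)
        if total_equal:
            assert torch.equal(ref, cur), f"exact mismatch in {name}"
        else:
            assert torch.allclose(ref, cur, rtol=3e-2, atol=3e-2), f"weight drift in {name}"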
