diff --git a/tests/test_training/7B_check_acc.py b/tests/test_training/7B_check_acc.py
index 3b727d7c..cb3902bc 100644
--- a/tests/test_training/7B_check_acc.py
+++ b/tests/test_training/7B_check_acc.py
@@ -1,16 +1,20 @@
 import os
 
-JOB_NAME = "7b_train"
+JOB_NAME = "7b_internlm2_train"
+model_type = "INTERNLM2_PUBLIC"
 DO_ALERT = False
 
+VOCAB_SIZE = 92544
 SEQ_LEN = 2048
 HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
+NUM_KV_ATTENTION_HEAD = 8
+MLP_RATIO = 3.5
 NUM_LAYER = 32
-VOCAB_SIZE = 103168
 
-MODEL_ONLY_FOLDER = os.path.join(os.environ["share_path"], "quailty_assurance/7B_model_weights_ckpt/init")
+MODEL_ONLY_FOLDER = os.path.join(
+    os.environ["share_path"], "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt/init"
+)
 # Ckpt folder format:
 # fs: 'local:/mnt/nfs/XXX'
 # SAVE_CKPT_FOLDER = "local:llm_ckpts_0925_9"
@@ -121,7 +125,8 @@
 )
 
 model = dict(
-    checkpoint=False,  # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1]
+    checkpoint=False,
+    num_chunks=1,
     num_attention_heads=NUM_ATTENTION_HEAD,
     embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
@@ -129,13 +134,22 @@
     parallel_output=True,
     hidden_size=HIDDEN_SIZE,
     num_layers=NUM_LAYER,
+    no_bias=True,
     mlp_ratio=MLP_RATIO,
     apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    dtype="torch.bfloat16",
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,
+    num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
     use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+    # Whether the odd and even columns of the query and key weights are interleaved as usual.
+    # If True, the odd and even columns are in their normal interleaved order; if False,
+    # the model has concatenated all odd columns ahead of all even columns in advance,
+    # to improve the computational efficiency of RoPE.
+    # Example:
+    # qk_interleaved = True:  q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
+    # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
+    qk_interleaved=False,
 )
 """
 zero1 parallel:
@@ -150,9 +164,9 @@
 tensor parallel: tensor parallel size, usually the number of GPUs per node.
 """
 parallel = dict(
-    zero1=dict(size=8),
-    tensor=dict(size=1, mode="mtp"),
-    pipeline=dict(size=1, interleaved_overlap=True),
+    zero1=dict(size=-1),
+    tensor=dict(size=2, mode="mtp"),
+    pipeline=dict(size=2, interleaved_overlap=True),
     weight=dict(size=1, overlap=True),
 )
 
@@ -165,5 +179,30 @@
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
         light_monitor_address=None,  # light_monitor address to send heartbeat
+        alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
+    ),
+    tensorboard=dict(
+        queue_max_length=10,
     ),
 )
+
+# metric_dtype can be "fp32" or any other string;
+# metrics are computed in fp32 only when it is set to "fp32".
+# metric_dtype = "fp32"
+
+generation = dict(
+    ckpt_folder="/path/to/saved/ckpt",
+    output_folder="/path/to/save/generation",
+    batch_size=1,
+    eos_id=[2, 0],
+    bos_id=1,
+    max_length=100,
+    do_sample=True,
+    temperature=1.0,
+    top_k=50,
+    top_p=1.0,
+    repetition_penalty=1,
+    length_penalty=1.0,
+)
+
+enable_tb = False
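The `qk_interleaved` comment above describes two weight layouts for RoPE. Here is a minimal sketch of the two orderings, as an illustration only (plain PyTorch indexing, not InternLM's actual rotary implementation):

```python
import torch

# One query row with head dimension 8, standing in for [q1, q2, ..., q8].
d = 8
q = torch.arange(1, d + 1)

# qk_interleaved=True: odd and even columns stay interleaved.
interleaved = q

# qk_interleaved=False: all odd columns are concatenated ahead of all even
# columns, so RoPE can rotate two contiguous halves instead of strided pairs.
deinterleaved = torch.cat([q[0::2], q[1::2]])

print(interleaved.tolist())    # [1, 2, 3, 4, 5, 6, 7, 8]
print(deinterleaved.tolist())  # [1, 3, 5, 7, 2, 4, 6, 8]
```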
""" parallel = dict( - zero1=dict(size=8), - tensor=dict(size=1, mode="mtp"), - pipeline=dict(size=1, interleaved_overlap=True), + zero1=dict(size=-1), + tensor=dict(size=2, mode="mtp"), + pipeline=dict(size=2, interleaved_overlap=True), weight=dict(size=1, overlap=True), ) @@ -165,5 +179,30 @@ enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message light_monitor_address=None, # light_monitor address to send heartbeat + alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", + ), + tensorboard=dict( + queue_max_length=10, ), ) + +# metric_dtype can be "fp32" or other string +# only when set to "fp32" will use fp32 to calc in metrics +# metric_dtype = "fp32" + +generation = dict( + ckpt_folder="/path/to/saved/ckpt", + output_folder="/path/to/save/generation", + batch_size=1, + eos_id=[2, 0], + bos_id=1, + max_length=100, + do_sample=True, + temperature=1.0, + top_k=50, + top_p=1.0, + repetition_penalty=1, + length_penalty=1.0, +) + +enable_tb = False diff --git a/tests/test_training/7B_check_init.py b/tests/test_training/7B_check_init.py index 6f72c7d7..03107d02 100644 --- a/tests/test_training/7B_check_init.py +++ b/tests/test_training/7B_check_init.py @@ -1,12 +1,14 @@ -JOB_NAME = "7b_train" +JOB_NAME = "7b_internlm2_train" +model_type = "INTERNLM2_PUBLIC" DO_ALERT = False +VOCAB_SIZE = 92544 SEQ_LEN = 2048 HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 +NUM_KV_ATTENTION_HEAD = 8 +MLP_RATIO = 3.5 NUM_LAYER = 32 -VOCAB_SIZE = 103168 CHECK_INIT = 1 @@ -128,7 +130,8 @@ ) model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + checkpoint=False, + num_chunks=1, num_attention_heads=NUM_ATTENTION_HEAD, embed_split_hidden=True, vocab_size=VOCAB_SIZE, @@ -136,13 +139,22 @@ parallel_output=True, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYER, + no_bias=True, mlp_ratio=MLP_RATIO, apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + dtype="torch.bfloat16", norm_type="rmsnorm", layer_norm_epsilon=1e-5, + num_kv_attention_heads=NUM_KV_ATTENTION_HEAD, use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. + # Whether the odd and even columns of the query and key in the model are normally interleaved. + # If it's True, the model's odd and even columns are normally ordered; if it's False, + # it means that the model has prematurely concatenated all odd columns and even columns in front + # and back, in order to improve the RoPE's computational efficiency. + # Example: + # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...] + # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...] 
diff --git a/tests/test_training/train_CI.py b/tests/test_training/train_CI.py
index b33cf4c3..7926bae5 100644
--- a/tests/test_training/train_CI.py
+++ b/tests/test_training/train_CI.py
@@ -20,7 +20,7 @@
 from internlm.checkpoint import CheckpointManager  # noqa: E402
 from internlm.core.context import ParallelMode  # noqa: E402
 from internlm.core.context import global_context as gpc  # noqa: E402
-from internlm.core.trainer import TrainState, Trainer  # noqa: E402
+from internlm.core.trainer import Trainer, TrainState  # noqa: E402
 from internlm.data import (  # noqa: E402
     build_train_loader_with_data_type,
     build_valid_loader_with_data_type,
@@ -60,6 +60,7 @@
 
 
 def check_model_weights(model, ckpt_path, total_equal=False):
+    model = model.model
     model1_dict = torch.load(ckpt_path, map_location="cuda")
     model2_dict = model.state_dict()
 
@@ -214,13 +215,14 @@
     # check model init weights
     if hasattr(gpc.config, "CHECK_INIT") and gpc.config.CHECK_INIT == 1:
         ckpt_name = (
-            f"model_dp{gpc.get_local_rank(ParallelMode.DATA)}"
+            f"model"
             f"_tp{gpc.get_local_rank(ParallelMode.TENSOR)}"
             f"_pp{gpc.get_local_rank(ParallelMode.PIPELINE)}.pt"
         )
-        ckpt_path = os.path.join(os.environ["share_path"], "quailty_assurance/7B_init_dp=2_tp=2_pp=2_ckpt", ckpt_name)
+        ckpt_path = os.path.join(
+            os.environ["share_path"], "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt/init", ckpt_name
+        )
         check_model_weights(model, ckpt_path, total_equal=True)
-
     with initialize_llm_profile(profiling=args.profiling, start_time=current_time) as prof:
         # start iterating the train data and begin training
         for batch_count in range(train_state.batch_count, total_steps):
@@ -327,12 +329,17 @@
                 )
 
             # check model weights
-            if gpc.is_rank_for_log() and batch_count > 0 and batch_count % 100 == 0:
+            if batch_count > 0 and batch_count % 100 == 0:
+                ckpt_name = (
+                    f"model"
+                    f"_tp{gpc.get_local_rank(ParallelMode.TENSOR)}"
+                    f"_pp{gpc.get_local_rank(ParallelMode.PIPELINE)}.pt"
+                )
                 ckpt_path = os.path.join(
                     os.environ["share_path"],
-                    "quailty_assurance/7B_model_weights_ckpt",
+                    "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt",
                     str(batch_count),
-                    "model_tp0_pp0.pt",
+                    ckpt_name,
                 )
                 check_model_weights(model, ckpt_path)
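The reference-checkpoint filename now drops the dp rank: with ZeRO-1, data-parallel replicas hold identical model weights, so one file per (tp, pp) coordinate suffices, and the periodic weight check now runs on every rank instead of only the logging rank. A hypothetical illustration of the naming scheme (the helper below is not part of the patch):

```python
# Each rank resolves the file for its own (tp, pp) coordinates; dp no longer
# appears in the name because data-parallel replicas share the same weights.
def ckpt_name(tp_rank: int, pp_rank: int) -> str:
    return f"model_tp{tp_rank}_pp{pp_rank}.pt"

# With tensor=2 and pipeline=2, four files cover every rank in the job:
print(sorted(ckpt_name(t, p) for t in range(2) for p in range(2)))
# ['model_tp0_pp0.pt', 'model_tp0_pp1.pt', 'model_tp1_pp0.pt', 'model_tp1_pp1.pt']
```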