Commit
fix train ci
li126com committed Dec 4, 2024
1 parent 431b5e6 commit 8f551fb
Showing 3 changed files with 106 additions and 23 deletions.
59 changes: 49 additions & 10 deletions tests/test_training/7B_check_acc.py
@@ -1,16 +1,20 @@
 import os
 
-JOB_NAME = "7b_train"
+JOB_NAME = "7b_internlm2_train"
+model_type = "INTERNLM2_PUBLIC"
 DO_ALERT = False
 
+VOCAB_SIZE = 92544
 SEQ_LEN = 2048
 HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
+NUM_KV_ATTENTION_HEAD = 8
+MLP_RATIO = 3.5
 NUM_LAYER = 32
-VOCAB_SIZE = 103168
 
-MODEL_ONLY_FOLDER = os.path.join(os.environ["share_path"], "quailty_assurance/7B_model_weights_ckpt/init")
+MODEL_ONLY_FOLDER = os.path.join(
+    os.environ["share_path"], "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt/init"
+)
 # Ckpt folder format:
 # fs: 'local:/mnt/nfs/XXX'
 # SAVE_CKPT_FOLDER = "local:llm_ckpts_0925_9"
@@ -121,21 +125,31 @@
 )
 
 model = dict(
-    checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1]
+    checkpoint=False,
+    num_chunks=1,
     num_attention_heads=NUM_ATTENTION_HEAD,
     embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
     embed_grad_scale=1,
     parallel_output=True,
     hidden_size=HIDDEN_SIZE,
     num_layers=NUM_LAYER,
+    no_bias=True,
     mlp_ratio=MLP_RATIO,
     apply_post_layer_norm=False,
-    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    dtype="torch.bfloat16",
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,
+    num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
     use_flash_attn=True,
-    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+    # Whether the odd and even columns of the query and key in the model are normally interleaved.
+    # If it's True, the model's odd and even columns are normally ordered; if it's False,
+    # it means that the model has prematurely concatenated all odd columns and even columns in front
+    # and back, in order to improve the RoPE's computational efficiency.
+    # Example:
+    # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
+    # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
+    qk_interleaved=False,
 )
 """
 zero1 parallel:
@@ -150,9 +164,9 @@
 tensor parallel: tensor parallel size, usually the number of GPUs per node.
 """
 parallel = dict(
-    zero1=dict(size=8),
-    tensor=dict(size=1, mode="mtp"),
-    pipeline=dict(size=1, interleaved_overlap=True),
+    zero1=dict(size=-1),
+    tensor=dict(size=2, mode="mtp"),
+    pipeline=dict(size=2, interleaved_overlap=True),
     weight=dict(size=1, overlap=True),
 )
 
@@ -165,5 +179,30 @@
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None, # feishu webhook to send alert message
         light_monitor_address=None, # light_monitor address to send heartbeat
+        alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
+    tensorboard=dict(
+        queue_max_length=10,
+    ),
 )
+
+# metric_dtype can be "fp32" or other string
+# only when set to "fp32" will use fp32 to calc in metrics
+# metric_dtype = "fp32"
+
+generation = dict(
+    ckpt_folder="/path/to/saved/ckpt",
+    output_folder="/path/to/save/generation",
+    batch_size=1,
+    eos_id=[2, 0],
+    bos_id=1,
+    max_length=100,
+    do_sample=True,
+    temperature=1.0,
+    top_k=50,
+    top_p=1.0,
+    repetition_penalty=1,
+    length_penalty=1.0,
+)
+
+enable_tb = False
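The qk_interleaved comment added to the model dict above describes two layouts for the query/key projection columns. As a toy illustration of the mapping (my own example, not code from this commit; it assumes only PyTorch), the non-interleaved layout places all odd-numbered columns ahead of the even-numbered ones, which is the form a rotate-half style RoPE implementation typically expects:

import torch

head_dim = 8
q = torch.arange(1, head_dim + 1)  # interleaved order: [q1, q2, ..., q8]

# qk_interleaved=False layout: odd-numbered columns first, then even-numbered ones.
q_split = torch.cat([q[0::2], q[1::2]])  # [q1, q3, q5, q7, q2, q4, q6, q8]

# Re-zip the two halves to recover the interleaved order.
half = head_dim // 2
q_back = torch.stack([q_split[:half], q_split[half:]], dim=-1).reshape(-1)
assert torch.equal(q_back, q)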
49 changes: 43 additions & 6 deletions tests/test_training/7B_check_init.py
@@ -1,12 +1,14 @@
JOB_NAME = "7b_train"
JOB_NAME = "7b_internlm2_train"
model_type = "INTERNLM2_PUBLIC"
DO_ALERT = False

VOCAB_SIZE = 92544
SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_KV_ATTENTION_HEAD = 8
MLP_RATIO = 3.5
NUM_LAYER = 32
VOCAB_SIZE = 103168

CHECK_INIT = 1

@@ -128,21 +130,31 @@
 )
 
 model = dict(
-    checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1]
+    checkpoint=False,
+    num_chunks=1,
     num_attention_heads=NUM_ATTENTION_HEAD,
     embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
     embed_grad_scale=1,
     parallel_output=True,
     hidden_size=HIDDEN_SIZE,
     num_layers=NUM_LAYER,
+    no_bias=True,
     mlp_ratio=MLP_RATIO,
     apply_post_layer_norm=False,
-    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    dtype="torch.bfloat16",
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,
+    num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
     use_flash_attn=True,
-    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+    # Whether the odd and even columns of the query and key in the model are normally interleaved.
+    # If it's True, the model's odd and even columns are normally ordered; if it's False,
+    # it means that the model has prematurely concatenated all odd columns and even columns in front
+    # and back, in order to improve the RoPE's computational efficiency.
+    # Example:
+    # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
+    # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
+    qk_interleaved=False,
 )
 
 parallel = dict(
@@ -161,5 +173,30 @@
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None, # feishu webhook to send alert message
         light_monitor_address=None, # light_monitor address to send heartbeat
+        alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
+    tensorboard=dict(
+        queue_max_length=10,
+    ),
 )
+
+# metric_dtype can be "fp32" or other string
+# only when set to "fp32" will use fp32 to calc in metrics
+# metric_dtype = "fp32"
+
+generation = dict(
+    ckpt_folder="/path/to/saved/ckpt",
+    output_folder="/path/to/save/generation",
+    batch_size=1,
+    eos_id=[2, 0],
+    bos_id=1,
+    max_length=100,
+    do_sample=True,
+    temperature=1.0,
+    top_k=50,
+    top_p=1.0,
+    repetition_penalty=1,
+    length_penalty=1.0,
+)
+
+enable_tb = False
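For context on the parallel layout these CI configs target: 7B_check_acc.py sets tensor=2 and pipeline=2, and the reference checkpoint folders are named with dp=2, which implies a 2 x 2 x 2 = 8-rank job. A rough sketch of that arithmetic (the helper is my own, assuming the usual convention that data-parallel size is world_size / (tensor * pipeline) and that zero1=dict(size=-1) spans the full data-parallel group):

def derive_parallel_sizes(world_size: int, tensor: int, pipeline: int) -> dict:
    # Assumption: dp = world_size / (tp * pp); zero1=-1 is read as "use the whole dp group".
    assert world_size % (tensor * pipeline) == 0, "tp * pp must divide world_size"
    data = world_size // (tensor * pipeline)
    return {"data": data, "tensor": tensor, "pipeline": pipeline, "zero1": data}

print(derive_parallel_sizes(world_size=8, tensor=2, pipeline=2))
# -> {'data': 2, 'tensor': 2, 'pipeline': 2, 'zero1': 2}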
21 changes: 14 additions & 7 deletions tests/test_training/train_CI.py
@@ -20,7 +20,7 @@
 from internlm.checkpoint import CheckpointManager # noqa: E402
 from internlm.core.context import ParallelMode # noqa: E402
 from internlm.core.context import global_context as gpc # noqa: E402
-from internlm.core.trainer import TrainState, Trainer # noqa: E402
+from internlm.core.trainer import Trainer, TrainState # noqa: E402
 from internlm.data import ( # noqa: E402
     build_train_loader_with_data_type,
     build_valid_loader_with_data_type,
@@ -60,6 +60,7 @@
 
 
 def check_model_weights(model, ckpt_path, total_equal=False):
+    model = model.model
     model1_dict = torch.load(ckpt_path, map_location="cuda")
     model2_dict = model.state_dict()
 
@@ -214,13 +215,14 @@ def main(args):
     # check model init weights
     if hasattr(gpc.config, "CHECK_INIT") and gpc.config.CHECK_INIT == 1:
         ckpt_name = (
-            f"model_dp{gpc.get_local_rank(ParallelMode.DATA)}"
+            f"model"
             f"_tp{gpc.get_local_rank(ParallelMode.TENSOR)}"
             f"_pp{gpc.get_local_rank(ParallelMode.PIPELINE)}.pt"
         )
-        ckpt_path = os.path.join(os.environ["share_path"], "quailty_assurance/7B_init_dp=2_tp=2_pp=2_ckpt", ckpt_name)
+        ckpt_path = os.path.join(
+            os.environ["share_path"], "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt/init", ckpt_name
+        )
         check_model_weights(model, ckpt_path, total_equal=True)
-
     with initialize_llm_profile(profiling=args.profiling, start_time=current_time) as prof:
         # start iterating the train data and begin training
         for batch_count in range(train_state.batch_count, total_steps):
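The init-check above (and the periodic check below) now builds the reference filename from the tensor- and pipeline-parallel ranks only; the dp rank was dropped from the name, presumably because data-parallel replicas hold identical weights. Purely for illustration, with hypothetical rank values:

# Hypothetical ranks for a dp=2 / tp=2 / pp=2 job; only tp/pp select the reference file.
tp_rank, pp_rank = 1, 0
ckpt_name = f"model_tp{tp_rank}_pp{pp_rank}.pt"  # -> "model_tp1_pp0.pt"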
@@ -327,12 +329,17 @@ def main(args):
             )
 
             # check model weights
-            if gpc.is_rank_for_log() and batch_count > 0 and batch_count % 100 == 0:
+            if batch_count > 0 and batch_count % 100 == 0:
+                ckpt_name = (
+                    f"model"
+                    f"_tp{gpc.get_local_rank(ParallelMode.TENSOR)}"
+                    f"_pp{gpc.get_local_rank(ParallelMode.PIPELINE)}.pt"
+                )
                 ckpt_path = os.path.join(
                     os.environ["share_path"],
-                    "quailty_assurance/7B_model_weights_ckpt",
+                    "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt",
                     str(batch_count),
-                    "model_tp0_pp0.pt",
+                    ckpt_name,
                 )
                 check_model_weights(model, ckpt_path)
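check_model_weights now unwraps the wrapped module (model = model.model) and runs on every rank, loading the reference file that matches the rank's tp/pp coordinates. The rest of its body is not shown in this diff; the following is a hedged sketch of the kind of state-dict comparison such a check typically performs (the function name and tolerances are my own):

import torch

def compare_state_dicts(reference, current, total_equal=False):
    # reference: dict loaded from the saved checkpoint; current: model.state_dict().
    for name, ref in reference.items():
        assert name in current, f"missing parameter: {name}"
        cur = current[name].to(device=ref.device, dtype=ref.dtype)
        if total_equal:
            assert torch.equal(ref, cur), f"exact mismatch in {name}"
        else:
            assert torch.allclose(ref, cur, rtol=3e-2, atol=3e-2), f"weight drift in {name}"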
