Refactor the runner
aoyulong committed Oct 14, 2024
1 parent d5ea05a · commit f107661
Showing 27 changed files with 845 additions and 591 deletions.
23 changes: 21 additions & 2 deletions examples/aquila/conf/config.yaml
@@ -10,12 +10,31 @@ experiment:
     backend: megatron
     entrypoint: ./flagscale/train/train_aquila.py
   runner:
+    hostfile: /share/project/ayl/FlagScale/hostfile
     backend: torchrun
-    nnodes: 1
-    nproc_per_node: 8
+    nnodes: 1
+    nproc_per_node: 8
+    cmds:
+      before_start: source /root/miniconda3/bin/activate flagscale
   envs:
     CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
     CUDA_DEVICE_MAX_CONNECTIONS: 1
+    # TORCH_LOGS: "+dynamo"
+    # TORCHDYNAMO_VERBOSE: 1
+    # CUDA_LAUNCH_BLOCKING: 1
+    # TORCH_CPP_LOG_LEVEL: INFO
+    # NCCL_DEBUG: INFO
+    # TORCH_DISTRIBUTED_DEBUG: DETAIL
+    # TORCH_SHOW_CPP_STACKTRACES: 1
+    # NCCL_SOCKET_IFNAME: eth0
+    # NCCL_IB_DISABLE: 0
+    # NCCL_IB_CUDA_SUPPORT: 1
+    # NCCL_IB_GID_INDEX: 0
+    # NCCL_IB_HCA: mlx5_2,mlx5_5
+    # NCCL_IB_TIMEOUT: 23
+    # NCCL_IB_RETRY_CNT: 7
+    # OMP_NUM_THREADS: 4
+    # GLOO_SOCKET_IFNAME: eth0

 action: run
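For orientation, the runner block above expands into an ordinary torchrun launch. A minimal sketch of that expansion, assuming runner, task, and envs sit under experiment as the indentation suggests, and using OmegaConf (already a dependency of this repo); the command assembly is illustrative, not FlagScale's actual launcher code:

    import shlex
    from omegaconf import OmegaConf

    config = OmegaConf.load("examples/aquila/conf/config.yaml")
    exp = config.experiment

    # Export the configured environment variables, then launch via torchrun.
    env_prefix = " ".join(
        f"{key}={shlex.quote(str(val))}" for key, val in exp.get("envs", {}).items()
    )
    cmd = (
        f"{exp.runner.cmds.before_start} && {env_prefix} "
        f"torchrun --nnodes {exp.runner.nnodes} "
        f"--nproc_per_node {exp.runner.nproc_per_node} "
        f"{exp.task.entrypoint}"
    )
    print(cmd)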
22 changes: 0 additions & 22 deletions examples/aquila/conf/config_infer.yaml

This file was deleted.

15 changes: 6 additions & 9 deletions examples/aquila/conf/inference/inference_aquila_7b.yaml
@@ -1,22 +1,19 @@
-engine:
-  model: BAAI/Aquila-7B/
-  tokenizer: BAAI/Aquila-7B/
+llm:
+  model: xxxx
   trust_remote_code: true
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
   gpu_memory_utilization: 0.6
   dtype: bfloat16
   seed: 1234

-data:
+generate:
   prompts: [
     "Hello, my name is",
     "The president of the United States is",
     "The capital of France is",
     "The future of AI is",
   ]
-  # prompts_path: null
-  top_p: 0.95
-  top_k: 100
-  max_tokens: 7
-  temperature: 0.9
+  sampling:
+    top_p: 0.95
+    temperature: 0.8
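The renamed llm and generate sections mirror vLLM's public API almost key for key (the LLM constructor arguments and SamplingParams), although the diff itself never names the backend. A hypothetical consumer of the new schema, with the model path borrowed from the deleted lines since xxxx is a placeholder:

    from vllm import LLM, SamplingParams  # assumed backend; the key names match its API

    llm = LLM(
        model="BAAI/Aquila-7B",   # stands in for the "xxxx" placeholder above
        trust_remote_code=True,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.6,
        dtype="bfloat16",
        seed=1234,
    )
    sampling = SamplingParams(top_p=0.95, temperature=0.8)
    for output in llm.generate(["Hello, my name is"], sampling):
        print(output.outputs[0].text)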
23 changes: 14 additions & 9 deletions examples/aquila/conf/train/demo.yaml
@@ -1,10 +1,13 @@
 system:
   tensor_model_parallel_size: 2
   pipeline_model_parallel_size: 2
+  context_parallel_size: 1
   disable_bias_linear: True
   use_flash_attn: True
   sequence_parallel: True
   use_distributed_optimizer: True
+  use_mcore_models: true
+  transformer_impl: transformer_engine
   precision:
     bf16: True
     attention_softmax_in_fp32: True
@@ -15,8 +18,10 @@ system:
     wandb_project: "aquila2"
     wandb_exp_name: "test"
   checkpoint:
-    save_interval: 1000
-
+    ckpt_format: torch_dist
+    ckpt_fully_parallel_load: true
+    no_load_rng: true
+    save_interval: 40

 model:
   num_layers: 12
@@ -55,13 +60,13 @@ model:
   lr_decay_style: cosine

 data:
-  data_path: ${data_path:??}
+  data_path: /share/project/ayl/test_data/old_pile/pile_wikipedia_demo
   split: 1
   tokenizer:
-    tokenizer_type: null
+    tokenizer_type: AquilaTokenizerFS
     tokenizer_path: null
-    vocab_file: null
-    merge_file: null
-    special_tokens_file: null
-    vocab_size: null
-    make_vocab_size_divisible_by: 64
+    vocab_file: /share/project/ayl/tmp/FlagScale/examples/aquila/tokenizer/vocab.json
+    merge_file: /share/project/ayl/tmp/FlagScale/examples/aquila/tokenizer/merges.txt
+    special_tokens_file: /share/project/ayl/tmp/FlagScale/examples/aquila/tokenizer/special_tokens.txt
+    vocab_size: 100008
+    make_vocab_size_divisible_by: 64
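The newly concrete vocab_size: 100008 interacts with make_vocab_size_divisible_by: 64 and this file's tensor_model_parallel_size: 2: Megatron-style trainers pad the vocabulary to a multiple of divisor × TP so each tensor-parallel shard of the embedding is equal and aligned. A sketch of that standard rule (illustrative; the exact helper is outside this diff):

    import math

    def padded_vocab_size(vocab_size: int, divisible_by: int, tp_size: int) -> int:
        # Pad so the embedding splits evenly across TP ranks with aligned shards.
        multiple = divisible_by * tp_size
        return math.ceil(vocab_size / multiple) * multiple

    print(padded_vocab_size(100008, 64, 2))  # -> 100096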
7 changes: 4 additions & 3 deletions examples/mixtral/conf/config.yaml
@@ -4,14 +4,15 @@ defaults:

 experiment:
   exp_name: mixtral-8x7b
-  exp_dir: outputs
+  exp_dir: ./outputs
   task:
     type: train
     backend: megatron
     entrypoint: flagscale/train/train_mixtral.py
   runner:
     backend: torchrun
-    hostfile: <xxxx>
+    hostfile: /share/project/ayl/FlagScale/hostfile
+    cmds:
+      before_start: source /root/miniconda3/bin/activate flagscale
   envs:
     CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
     CUDA_DEVICE_MAX_CONNECTIONS: 1
18 changes: 11 additions & 7 deletions examples/mixtral/conf/train/train_mixtral_8x7b.yaml
@@ -1,9 +1,7 @@
 system:
   tensor_model_parallel_size: 2
-  pipeline_model_parallel_size: 4
-  expert_model_parallel_size: 4
-  use_mcore_models: true
-  transformer_impl: transformer_engine
+  pipeline_model_parallel_size: 2
+  expert_model_parallel_size: 2
   sequence_parallel: true
   use_distributed_optimizer: true
   precision:
@@ -19,7 +17,9 @@ system:


 model:
-  num_layers: 32
+  use_mcore_models: true
+  transformer_impl: transformer_engine
+  num_layers: 8
   hidden_size: 4096
   ffn_hidden_size: 14336
   num_attention_heads: 32
@@ -65,9 +65,13 @@ model:


 data:
-  data_path: <xxxx>
+  data_path: /share/project/ayl/test_data/pile_wikipedia_demo
   split: 1
   tokenizer:
     tokenizer_type: QwenTokenizerFS
-    tokenizer_path: <xxxx>
+    tokenizer_path: /share/project/ayl/tokenizer/qwen
+    vocab_file: ./examples/aquila/tokenizer/vocab.json
+    merge_file: ./examples/aquila/tokenizer/vocab.json
+    special_tokens_file: ./examples/aquila/tokenizer/special_tokens.txt
+    vocab_size: 151851
+    make_vocab_size_divisible_by: 64
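The drop from pipeline/expert parallel 4 to 2 is what lets this example fit one 8-GPU node (the runner configs above use nproc_per_node: 8). A quick consistency check of the implied decomposition, following the usual Megatron rules that TP × PP must divide the world size and that expert parallelism is carved out of the data-parallel dimension:

    world_size = 8                # one node, eight processes, per the runner config
    tp, pp, ep = 2, 2, 2          # tensor / pipeline / expert model parallel sizes

    assert world_size % (tp * pp) == 0
    dp = world_size // (tp * pp)  # data-parallel size: 8 // 4 = 2
    assert dp % ep == 0           # expert-parallel groups must divide DP
    print(f"dp={dp} works; the old pp=4, ep=4 needed at least {2 * 4 * 4} GPUs")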
8 changes: 4 additions & 4 deletions flagscale/auto_tuner/tuner.py
@@ -6,8 +6,8 @@

 from omegaconf import DictConfig, OmegaConf

-from flagscale.launcher.job_status import JobStatus
-from flagscale.launcher.runner import SSHRunner
+from flagscale.runner.runner_base import JobStatus
+from flagscale.runner.runner_train import SSHTrainRunner

 from .generate import Generator
 from .platform import set_jiuding_platform_args
@@ -160,7 +160,7 @@ def tune(self):
             raise ValueError(f"No strategy can run.")
         best_task = self.generator.gen_best_task(best_strategy, self.orig_config)
         best_task.action = "run"
-        runner = SSHRunner(best_task)
+        runner = SSHTrainRunner(best_task)
         runner.run(monitor=True, interval=60)
@@ -213,7 +213,7 @@ def run(self, task=None):
         # Instantiate a runner and run the task
         if task is None:
             task = self.cur_task
-        self.runner = SSHRunner(task)
+        self.runner = SSHTrainRunner(task)
         self.runner.run()
         # set start time
         self.task_start_time = time.time()
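These renames trace the commit's core refactor: the old flagscale.launcher package becomes flagscale.runner, JobStatus moves into runner_base, and the SSH runner is specialized per task type, with training handled by SSHTrainRunner in runner_train. Downstream usage, reusing only the calls visible in this diff (task is a DictConfig prepared elsewhere in the auto-tuner):

    from flagscale.runner.runner_train import SSHTrainRunner

    runner = SSHTrainRunner(task)
    runner.run(monitor=True, interval=60)  # monitor the job, polling every 60 s, as tune() does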
2 changes: 1 addition & 1 deletion flagscale/auto_tuner/utils.py
@@ -4,7 +4,7 @@
 import subprocess
 from types import SimpleNamespace

-from flagscale.launcher.runner import parse_hostfile
+from flagscale.runner.runner import parse_hostfile


 def divisible(x, y):
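parse_hostfile moves in the same refactor. Its file format is not shown in this commit, but the configs above point at a hostfile, and MPI-style launchers conventionally list one host per line with a slot count. A purely hypothetical parser to fix the idea; the real flagscale.runner.runner.parse_hostfile may well differ:

    def parse_hostfile(path):
        # Hypothetical format: "node-0 slots=8 type=A100"; '#' starts a comment.
        hosts = []
        with open(path) as f:
            for raw in f:
                line = raw.split("#", 1)[0].strip()
                if not line:
                    continue
                name, *attrs = line.split()
                entry = {"host": name}
                for attr in attrs:
                    key, _, value = attr.partition("=")
                    entry[key] = value
                hosts.append(entry)
        return hosts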
(19 more changed files not shown)

0 comments on commit f107661
