Refactor the runner
aoyulong committed Oct 14, 2024
1 parent d5ea05a · commit f107661
Showing 27 changed files with 845 additions and 591 deletions.
23 changes: 21 additions & 2 deletions examples/aquila/conf/config.yaml
@@ -10,12 +10,31 @@ experiment:
     backend: megatron
     entrypoint: ./flagscale/train/train_aquila.py
   runner:
+    hostfile: /share/project/ayl/FlagScale/hostfile
     backend: torchrun
-    nnodes: 1
-    nproc_per_node: 8
+    nnodes: 1
+    nproc_per_node: 8
+    cmds:
+      before_start: source /root/miniconda3/bin/activate flagscale
   envs:
     CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
     CUDA_DEVICE_MAX_CONNECTIONS: 1
+    # TORCH_LOGS: "+dynamo"
+    # TORCHDYNAMO_VERBOSE: 1
+    # CUDA_LAUNCH_BLOCKING: 1
+    # TORCH_CPP_LOG_LEVEL: INFO
+    # NCCL_DEBUG: INFO
+    # TORCH_DISTRIBUTED_DEBUG: DETAIL
+    # TORCH_SHOW_CPP_STACKTRACES: 1
+    # NCCL_SOCKET_IFNAME: eth0
+    # NCCL_IB_DISABLE: 0
+    # NCCL_IB_CUDA_SUPPORT: 1
+    # NCCL_IB_GID_INDEX: 0
+    # NCCL_IB_HCA: mlx5_2,mlx5_5
+    # NCCL_IB_TIMEOUT: 23
+    # NCCL_IB_RETRY_CNT: 7
+    # OMP_NUM_THREADS: 4
+    # GLOO_SOCKET_IFNAME: eth0

 action: run
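For orientation, the runner block above expands into an ordinary torchrun launch. A minimal sketch of that expansion, assuming runner, task, and envs sit under experiment as the indentation suggests, and using OmegaConf (already a dependency of this repo); the command assembly is illustrative, not FlagScale's actual launcher code:

    import shlex
    from omegaconf import OmegaConf

    config = OmegaConf.load("examples/aquila/conf/config.yaml")
    exp = config.experiment

    # Export the configured environment variables, then launch via torchrun.
    env_prefix = " ".join(
        f"{key}={shlex.quote(str(val))}" for key, val in exp.get("envs", {}).items()
    )
    cmd = (
        f"{exp.runner.cmds.before_start} && {env_prefix} "
        f"torchrun --nnodes {exp.runner.nnodes} "
        f"--nproc_per_node {exp.runner.nproc_per_node} "
        f"{exp.task.entrypoint}"
    )
    print(cmd)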
22 changes: 0 additions & 22 deletions examples/aquila/conf/config_infer.yaml

This file was deleted.

15 changes: 6 additions & 9 deletions examples/aquila/conf/inference/inference_aquila_7b.yaml
@@ -1,22 +1,19 @@
-engine:
-  model: BAAI/Aquila-7B/
-  tokenizer: BAAI/Aquila-7B/
+llm:
+  model: xxxx
   trust_remote_code: true
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
   gpu_memory_utilization: 0.6
   dtype: bfloat16
   seed: 1234

-data:
+generate:
   prompts: [
     "Hello, my name is",
     "The president of the United States is",
     "The capital of France is",
     "The future of AI is",
   ]
-  # prompts_path: null
-  top_p: 0.95
-  top_k: 100
-  max_tokens: 7
-  temperature: 0.9
+  sampling:
+    top_p: 0.95
+    temperature: 0.8
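The renamed llm and generate sections mirror vLLM's public API almost key for key (the LLM constructor arguments and SamplingParams), although the diff itself never names the backend. A hypothetical consumer of the new schema, with the model path borrowed from the deleted lines since xxxx is a placeholder:

    from vllm import LLM, SamplingParams  # assumed backend; the key names match its API

    llm = LLM(
        model="BAAI/Aquila-7B",   # stands in for the "xxxx" placeholder above
        trust_remote_code=True,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.6,
        dtype="bfloat16",
        seed=1234,
    )
    sampling = SamplingParams(top_p=0.95, temperature=0.8)
    for output in llm.generate(["Hello, my name is"], sampling):
        print(output.outputs[0].text)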
23 changes: 14 additions & 9 deletions examples/aquila/conf/train/demo.yaml
@@ -1,10 +1,13 @@
 system:
   tensor_model_parallel_size: 2
   pipeline_model_parallel_size: 2
+  context_parallel_size: 1
   disable_bias_linear: True
   use_flash_attn: True
   sequence_parallel: True
   use_distributed_optimizer: True
+  use_mcore_models: true
+  transformer_impl: transformer_engine
   precision:
     bf16: True
     attention_softmax_in_fp32: True
@@ -15,8 +18,10 @@ system:
     wandb_project: "aquila2"
     wandb_exp_name: "test"
   checkpoint:
-    save_interval: 1000
-
+    ckpt_format: torch_dist
+    ckpt_fully_parallel_load: true
+    no_load_rng: true
+    save_interval: 40

 model:
   num_layers: 12
@@ -55,13 +60,13 @@ model:
   lr_decay_style: cosine

 data:
-  data_path: ${data_path:??}
+  data_path: /share/project/ayl/test_data/old_pile/pile_wikipedia_demo
   split: 1
   tokenizer:
-    tokenizer_type: null
+    tokenizer_type: AquilaTokenizerFS
     tokenizer_path: null
-    vocab_file: null
-    merge_file: null
-    special_tokens_file: null
-    vocab_size: null
-    make_vocab_size_divisible_by: 64
+    vocab_file: /share/project/ayl/tmp/FlagScale/examples/aquila/tokenizer/vocab.json
+    merge_file: /share/project/ayl/tmp/FlagScale/examples/aquila/tokenizer/merges.txt
+    special_tokens_file: /share/project/ayl/tmp/FlagScale/examples/aquila/tokenizer/special_tokens.txt
+    vocab_size: 100008
+    make_vocab_size_divisible_by: 64
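The newly concrete vocab_size: 100008 interacts with make_vocab_size_divisible_by: 64 and this file's tensor_model_parallel_size: 2: Megatron-style trainers pad the vocabulary to a multiple of divisor × TP so each tensor-parallel shard of the embedding is equal and aligned. A sketch of that standard rule (illustrative; the exact helper is outside this diff):

    import math

    def padded_vocab_size(vocab_size: int, divisible_by: int, tp_size: int) -> int:
        # Pad so the embedding splits evenly across TP ranks with aligned shards.
        multiple = divisible_by * tp_size
        return math.ceil(vocab_size / multiple) * multiple

    print(padded_vocab_size(100008, 64, 2))  # -> 100096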
7 changes: 4 additions & 3 deletions examples/mixtral/conf/config.yaml
@@ -4,14 +4,15 @@ defaults:

 experiment:
   exp_name: mixtral-8x7b
-  exp_dir: outputs
+  exp_dir: ./outputs
   task:
     type: train
     backend: megatron
     entrypoint: flagscale/train/train_mixtral.py
   runner:
     backend: torchrun
-    hostfile: <xxxx>
+    hostfile: /share/project/ayl/FlagScale/hostfile
+    cmds:
+      before_start: source /root/miniconda3/bin/activate flagscale
   envs:
     CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
     CUDA_DEVICE_MAX_CONNECTIONS: 1
18 changes: 11 additions & 7 deletions examples/mixtral/conf/train/train_mixtral_8x7b.yaml
@@ -1,9 +1,7 @@
 system:
   tensor_model_parallel_size: 2
-  pipeline_model_parallel_size: 4
-  expert_model_parallel_size: 4
-  use_mcore_models: true
-  transformer_impl: transformer_engine
+  pipeline_model_parallel_size: 2
+  expert_model_parallel_size: 2
   sequence_parallel: true
   use_distributed_optimizer: true
   precision:
@@ -19,7 +17,9 @@ system:


 model:
-  num_layers: 32
+  use_mcore_models: true
+  transformer_impl: transformer_engine
+  num_layers: 8
   hidden_size: 4096
   ffn_hidden_size: 14336
   num_attention_heads: 32
@@ -65,9 +65,13 @@ model:


 data:
-  data_path: <xxxx>
+  data_path: /share/project/ayl/test_data/pile_wikipedia_demo
   split: 1
   tokenizer:
     tokenizer_type: QwenTokenizerFS
-    tokenizer_path: <xxxx>
+    tokenizer_path: /share/project/ayl/tokenizer/qwen
+    vocab_file: ./examples/aquila/tokenizer/vocab.json
+    merge_file: ./examples/aquila/tokenizer/vocab.json
+    special_tokens_file: ./examples/aquila/tokenizer/special_tokens.txt
+    vocab_size: 151851
+    make_vocab_size_divisible_by: 64
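The drop from pipeline/expert parallel 4 to 2 is what lets this example fit one 8-GPU node (the runner configs above use nproc_per_node: 8). A quick consistency check of the implied decomposition, following the usual Megatron rules that TP × PP must divide the world size and that expert parallelism is carved out of the data-parallel dimension:

    world_size = 8                # one node, eight processes, per the runner config
    tp, pp, ep = 2, 2, 2          # tensor / pipeline / expert model parallel sizes

    assert world_size % (tp * pp) == 0
    dp = world_size // (tp * pp)  # data-parallel size: 8 // 4 = 2
    assert dp % ep == 0           # expert-parallel groups must divide DP
    print(f"dp={dp} works; the old pp=4, ep=4 needed at least {2 * 4 * 4} GPUs")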
8 changes: 4 additions & 4 deletions flagscale/auto_tuner/tuner.py
@@ -6,8 +6,8 @@

 from omegaconf import DictConfig, OmegaConf

-from flagscale.launcher.job_status import JobStatus
-from flagscale.launcher.runner import SSHRunner
+from flagscale.runner.runner_base import JobStatus
+from flagscale.runner.runner_train import SSHTrainRunner

 from .generate import Generator
 from .platform import set_jiuding_platform_args
@@ -160,7 +160,7 @@ def tune(self):
             raise ValueError(f"No strategy can run.")
         best_task = self.generator.gen_best_task(best_strategy, self.orig_config)
         best_task.action = "run"
-        runner = SSHRunner(best_task)
+        runner = SSHTrainRunner(best_task)
         runner.run(monitor=True, interval=60)
@@ -213,7 +213,7 @@ def run(self, task=None):
         # Instantiate a runner and run the task
         if task is None:
             task = self.cur_task
-        self.runner = SSHRunner(task)
+        self.runner = SSHTrainRunner(task)
         self.runner.run()
         # set start time
         self.task_start_time = time.time()
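These renames trace the commit's core refactor: the old flagscale.launcher package becomes flagscale.runner, JobStatus moves into runner_base, and the SSH runner is specialized per task type, with training handled by SSHTrainRunner in runner_train. Downstream usage, reusing only the calls visible in this diff (task is a DictConfig prepared elsewhere in the auto-tuner):

    from flagscale.runner.runner_train import SSHTrainRunner

    runner = SSHTrainRunner(task)
    runner.run(monitor=True, interval=60)  # monitor the job, polling every 60 s, as tune() does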
2 changes: 1 addition & 1 deletion flagscale/auto_tuner/utils.py
@@ -4,7 +4,7 @@
 import subprocess
 from types import SimpleNamespace

-from flagscale.launcher.runner import parse_hostfile
+from flagscale.runner.runner import parse_hostfile


 def divisible(x, y):
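parse_hostfile moves in the same refactor. Its file format is not shown in this commit, but the configs above point at a hostfile, and MPI-style launchers conventionally list one host per line with a slot count. A purely hypothetical parser to fix the idea; the real flagscale.runner.runner.parse_hostfile may well differ:

    def parse_hostfile(path):
        # Hypothetical format: "node-0 slots=8 type=A100"; '#' starts a comment.
        hosts = []
        with open(path) as f:
            for raw in f:
                line = raw.split("#", 1)[0].strip()
                if not line:
                    continue
                name, *attrs = line.split()
                entry = {"host": name}
                for attr in attrs:
                    key, _, value = attr.partition("=")
                    entry[key] = value
                hosts.append(entry)
        return hosts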
(19 more changed files not shown)

0 comments on commit f107661
