Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions dev/ms-swift-megatron/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# syntax=docker/dockerfile:1

# Base: ModelScope image bundling CUDA 12.6.3, Python 3.11, torch 2.7.1,
# vLLM 0.10.0, modelscope 1.28.2 and ms-swift 3.7.2 (per the image tag).
FROM modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.6.3-py311-torch2.7.1-vllm0.10.0-modelscope1.28.2-swift3.7.2

# Non-interactive apt, no pip self-update notices, no .pyc files,
# and unbuffered Python stdout/stderr.
ENV DEBIAN_FRONTEND=noninteractive \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# OS-level packages SkyPilot expects to find on the instance (SSH server,
# rsync, netcat, PCI and FUSE tooling, a system Python 3.10 with pip/venv),
# plus distutils, which the gpustat build needs.
# Packages are listed alphabetically; the apt lists are removed in the same
# layer so they do not add to the image size.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        fuse3 \
        libfuse2 \
        libfuse3-3 \
        libpci3 \
        netcat-openbsd \
        openssh-server \
        pciutils \
        python3-distutils \
        python3-pip \
        python3.10 \
        python3.10-venv \
        rsync \
    && rm -rf /var/lib/apt/lists/*

# Ensure the SSH server can start (SkyPilot uses SSH inside pods): create the
# sshd runtime directory and enable root login.
# The sed edit is best-effort, so `|| true` is scoped to sed alone; a failing
# mkdir still fails the build. The pattern is anchored to the start of the line
# so only the (possibly commented-out) directive itself is rewritten, not
# comment text that merely mentions PermitRootLogin.
RUN mkdir -p /var/run/sshd && \
    { sed -i 's/^#\?PermitRootLogin .*/PermitRootLogin yes/' /etc/ssh/sshd_config || true; }

# Preinstall Ray and SkyPilot deps into system Python 3.10 so /usr/local/bin/ray works.
# pip and setuptools are capped first, then the pinned runtime packages are
# installed. Every pip call passes --no-cache-dir so no download cache is
# stored in the image layer.
RUN /usr/bin/python3.10 -m pip install --no-cache-dir --upgrade "pip<25.1" "setuptools<70" && \
    /usr/bin/python3.10 -m pip install --no-cache-dir "gpustat==1.1.1" "ray[default]==2.9.3" "skypilot==0.10.3"

# Also prepare the SkyPilot runtime venv to skip runtime setup where possible.
# The venv's own interpreter is invoked directly, which targets the same
# environment as sourcing bin/activate. Every pip call passes --no-cache-dir
# so no download cache is stored in the image layer.
RUN python3.10 -m venv /root/skypilot-runtime && \
    /root/skypilot-runtime/bin/python -m pip install --no-cache-dir --upgrade "pip<25.1" "setuptools<70" && \
    /root/skypilot-runtime/bin/python -m pip install --no-cache-dir "gpustat==1.1.1" "ray[default]==2.9.3" "skypilot==0.10.3"

# Keep base image entrypoint/cmd


11 changes: 11 additions & 0 deletions dev/ms-swift-megatron/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# config.yaml
# SkyPilot task definition for the ms-swift Megatron environment.
name: ms-swift-megatron

# Single node with two H200 GPUs on CoreWeave K8s
# NOTE(review): to-hf.sh and to-mcore.sh list CUDA devices 0-7 — confirm that
# two GPUs is the intended allocation.
num_nodes: 1
resources:
  infra: k8s
  accelerators: H200:2  # uses the cluster's GPU label catalog
  cpus: 16+
  memory: 64+
  image_id: docker:bradhiltonnw/ms-swift-megatron:skypilot
10 changes: 10 additions & 0 deletions dev/ms-swift-megatron/to-hf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Export the Megatron-Core adapters saved under megatron_output/ to a
# Hugging Face format checkpoint with `swift export`, in bfloat16, with the
# conversion precision test enabled.
# NOTE(review): "vx-xxx" looks like a placeholder for the actual run
# directory — confirm and replace before running.

export_args=(
    --mcore_adapters megatron_output/Qwen3-235B-A22B-Instruct-2507/vx-xxx
    --to_hf true
    --torch_dtype bfloat16
    --output_dir megatron_output/Qwen3-235B-A22B-Instruct-2507/vx-xxx-hf
    --test_convert_precision true
)

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 swift export "${export_args[@]}"

10 changes: 10 additions & 0 deletions dev/ms-swift-megatron/to-mcore.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Convert Qwen/Qwen3-235B-A22B-Instruct-2507 to Megatron-Core format with
# `swift export`, in bfloat16, with the conversion precision test enabled.
# The result is written to Qwen3-235B-A22B-Instruct-2507-mcore.

export_args=(
    --model Qwen/Qwen3-235B-A22B-Instruct-2507
    --to_mcore true
    --torch_dtype bfloat16
    --output_dir Qwen3-235B-A22B-Instruct-2507-mcore
    --test_convert_precision true
)

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 swift export "${export_args[@]}"

46 changes: 46 additions & 0 deletions dev/ms-swift-megatron/train.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# LoRA fine-tuning of Qwen3-235B-A22B-Instruct-2507 with `megatron sft`.
#
# Loads the Megatron-Core checkpoint from Qwen3-235B-A22B-Instruct-2507-mcore
# (the directory to-mcore.sh writes to) and saves checkpoints under
# megatron_output/Qwen3-235B-A22B-Instruct-2507.

# Abort on the first failing command, unset variable, or failed pipeline stage.
set -euo pipefail

export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
# Two processes, one per visible GPU.
export NPROC_PER_NODE=2
export CUDA_VISIBLE_DEVICES=0,1

# The tensor-parallel and expert-parallel sizes are set to 2 so they fit the
# two processes launched above; the previous values (4 and 8) cannot be
# satisfied by a two-rank job.
# NOTE(review): whether this model fits in memory on two GPUs has not been
# verified — confirm, or raise the GPU count together with these sizes.
megatron sft \
    --load Qwen3-235B-A22B-Instruct-2507-mcore \
    --dataset 'swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT#2000' \
              'swift/self-cognition#1000' \
    --train_type lora \
    --lora_rank 8 \
    --lora_alpha 32 \
    --target_modules all-linear \
    --split_dataset_ratio 0.01 \
    --moe_permute_fusion true \
    --tensor_model_parallel_size 2 \
    --expert_tensor_parallel_size 1 \
    --expert_model_parallel_size 2 \
    --moe_grouped_gemm true \
    --moe_shared_expert_overlap true \
    --moe_aux_loss_coeff 1e-3 \
    --micro_batch_size 8 \
    --global_batch_size 16 \
    --recompute_granularity full \
    --recompute_method uniform \
    --recompute_num_layers 1 \
    --max_epochs 1 \
    --finetune true \
    --cross_entropy_loss_fusion true \
    --lr 1e-4 \
    --lr_warmup_fraction 0.05 \
    --min_lr 1e-5 \
    --save megatron_output/Qwen3-235B-A22B-Instruct-2507 \
    --eval_interval 200 \
    --save_interval 200 \
    --max_length 2048 \
    --num_workers 8 \
    --dataset_num_proc 8 \
    --no_save_optim true \
    --no_save_rng true \
    --sequence_parallel true \
    --attention_backend flash \
    --model_author swift \
    --model_name swift-robot