OpenPipe · bradhilton · Sep 16, 2025 · Sep 16, 2025 · Sep 16, 2025 · Sep 16, 2025
diff --git a/dev/ms-swift-megatron/Dockerfile b/dev/ms-swift-megatron/Dockerfile
@@ -0,0 +1,42 @@
+# syntax=docker/dockerfile:1
+FROM modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.6.3-py311-torch2.7.1-vllm0.10.0-modelscope1.28.2-swift3.7.2
+
+# Build-time only: keeps apt non-interactive here without persisting into the runtime env
+ARG DEBIAN_FRONTEND=noninteractive
+
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+# Core packages SkyPilot expects on the instance, plus distutils for gpustat build (sorted A-Z)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        fuse3 \
+        libfuse2 \
+        libfuse3-3 \
+        libpci3 \
+        netcat-openbsd \
+        openssh-server \
+        pciutils \
+        python3-distutils \
+        python3-pip \
+        python3.10 \
+        python3.10-venv \
+        rsync \
+    && rm -rf /var/lib/apt/lists/*
+
+# Ensure SSH server can start (SkyPilot uses SSH inside pods); the sed edit is best-effort
+RUN mkdir -p /var/run/sshd && \
+    sed -i 's/#\?PermitRootLogin .*/PermitRootLogin yes/' /etc/ssh/sshd_config || true
+
+# Preinstall Ray and SkyPilot deps into system Python 3.10 so /usr/local/bin/ray works
+RUN /usr/bin/python3.10 -m pip install --no-cache-dir --upgrade "pip<25.1" "setuptools<70" && \
+    /usr/bin/python3.10 -m pip install --no-cache-dir "gpustat==1.1.1" "ray[default]==2.9.3" "skypilot==0.10.3"
+
+# Also prepare the SkyPilot runtime venv (same pins as above) to skip runtime setup where possible
+RUN python3.10 -m venv /root/skypilot-runtime && \
+    . /root/skypilot-runtime/bin/activate && \
+    pip install --no-cache-dir --upgrade "pip<25.1" "setuptools<70" && \
+    pip install --no-cache-dir "gpustat==1.1.1" "ray[default]==2.9.3" "skypilot==0.10.3"
+
+# Keep base image entrypoint/cmd
diff --git a/dev/ms-swift-megatron/config.yaml b/dev/ms-swift-megatron/config.yaml
@@ -0,0 +1,11 @@
+# config.yaml
+name: ms-swift-megatron
+
+# Single node; two H200 GPUs on CoreWeave K8s
+num_nodes: 1
+resources:
+  infra: k8s
+  accelerators: H200:2 # uses the cluster's GPU label catalog
+  cpus: 16+
+  memory: 64+
+  image_id: docker:bradhiltonnw/ms-swift-megatron:skypilot
diff --git a/dev/ms-swift-megatron/to-hf.sh b/dev/ms-swift-megatron/to-hf.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Usage: to-hf.sh [ADAPTER_DIR] -- converted weights are written to "<ADAPTER_DIR>-hf".
+ADAPTER_DIR="${1:-megatron_output/Qwen3-235B-A22B-Instruct-2507/vx-xxx}"  # default keeps the original path; vx-xxx looks like a placeholder
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+swift export \
+    --mcore_adapters "$ADAPTER_DIR" \
+    --to_hf true \
+    --torch_dtype bfloat16 \
+    --output_dir "${ADAPTER_DIR%/}-hf" \
+    --test_convert_precision true
diff --git a/dev/ms-swift-megatron/to-mcore.sh b/dev/ms-swift-megatron/to-mcore.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+swift export \
+    --model Qwen/Qwen3-235B-A22B-Instruct-2507 \
+    --to_mcore true \
+    --torch_dtype bfloat16 \
+    --output_dir Qwen3-235B-A22B-Instruct-2507-mcore \
+    --test_convert_precision true
+
diff --git a/dev/ms-swift-megatron/train.sh b/dev/ms-swift-megatron/train.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
+export NPROC_PER_NODE=2
+export CUDA_VISIBLE_DEVICES=0,1
+
+megatron sft \
+    --load Qwen3-235B-A22B-Instruct-2507-mcore \
+    --dataset 'swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT#2000' \
+              'swift/self-cognition#1000' \
+    --train_type lora \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --split_dataset_ratio 0.01 \
+    --moe_permute_fusion true \
+    --tensor_model_parallel_size 4 \
+    --expert_tensor_parallel_size 1 \
+    --expert_model_parallel_size 8 \
+    --moe_grouped_gemm true \
+    --moe_shared_expert_overlap true \
+    --moe_aux_loss_coeff 1e-3 \
+    --micro_batch_size 8 \
+    --global_batch_size 16 \
+    --recompute_granularity full \
+    --recompute_method uniform \
+    --recompute_num_layers 1 \
+    --max_epochs 1 \
+    --finetune true \
+    --cross_entropy_loss_fusion true \
+    --lr 1e-4 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 1e-5 \
+    --save megatron_output/Qwen3-235B-A22B-Instruct-2507 \
+    --eval_interval 200 \
+    --save_interval 200 \
+    --max_length 2048 \
+    --num_workers 8 \
+    --dataset_num_proc 8 \
+    --no_save_optim true \
+    --no_save_rng true \
+    --sequence_parallel true \
+    --attention_backend flash \
+    --model_author swift \
+    --model_name swift-robot
+