From e1c66178d8e645793ecb71f5f2aed33bc4b7d184 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 16 Sep 2025 00:06:50 +0000 Subject: [PATCH 1/6] feat: Add Qwen3 MoE LoRA SFT config Configure Qwen3 MoE for LoRA SFT with Megatron-SWIFT. Co-authored-by: bhilton --- dev/ms-swift-megatron/config.yaml | 128 ++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 dev/ms-swift-megatron/config.yaml diff --git a/dev/ms-swift-megatron/config.yaml b/dev/ms-swift-megatron/config.yaml new file mode 100644 index 00000000..27bb0397 --- /dev/null +++ b/dev/ms-swift-megatron/config.yaml @@ -0,0 +1,128 @@ +# config.yaml +name: qwen3-moe-lora-sft + +# Single node; single H200 on CoreWeave K8s +num_nodes: 1 +resources: + infra: k8s + accelerators: H200:1 # uses the cluster's GPU label catalog + cpus: 16+ + memory: 64+ + # Use a CUDA 12.4 + Torch 2.6 base with swift preinstalled; we will upgrade swift in setup. + image_id: docker:modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py310-torch2.6.0-vllm0.8.5.post1-modelscope1.27.1-swift3.5.3 + +envs: + HF_HOME: /cache/hf + TRANSFORMERS_CACHE: /cache/hf + MODELSCOPE_CACHE: /cache/ms + # Uncomment if you need tokenized access: + # HF_TOKEN: + +# Everything below runs *inside* the container on the K8s node. +setup: | + set -euxo pipefail + mkdir -p /cache/hf /cache/ms /workspace/data /workspace/output + + # Build deps for Apex; keep lean. + apt-get update && apt-get install -y git build-essential ninja-build && rm -rf /var/lib/apt/lists/* + + # Upgrade ms-swift to get Megatron-SWIFT LoRA support; install deps per docs. + python3 -m pip install --upgrade --no-cache-dir pip + python3 -m pip install --no-cache-dir \ + "ms-swift>=3.8" transformers datasets accelerate pybind11 + + # TransformerEngine (recommended for fused kernels / Flash attention path). + python3 -m pip install --no-build-isolation "git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.3" + + # Apex (commit pinned from ms-swift docs for CUDA 12.4 compatibility). + git clone https://github.com/NVIDIA/apex /tmp/apex + cd /tmp/apex + git checkout e13873debc4699d39c6861074b9a3b2a02327f92 + python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \ + --config-settings "--build-option=--cpp_ext" \ + --config-settings "--build-option=--cuda_ext" . + + # Megatron-Core pinned version used by Megatron-SWIFT. + python3 -m pip install --no-cache-dir "git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.12.0" + + # Tiny local SFT dataset to avoid external fetches; valid ms-swift "messages" format. 
+ python3 - <<'PY' +import json, os +os.makedirs('/workspace/data', exist_ok=True) +rows = [ + {"messages":[{"role":"system","content":"You are helpful."},{"role":"user","content":"Write a haiku about the ocean."},{"role":"assistant","content":"Waves count ancient time /\nSalt maps the moonlit harbor /\nGulls stitch dawn to foam."}]}, + {"messages":[{"role":"user","content":"Summarize: Transformers let tokens attend to each other."},{"role":"assistant","content":"Transformers use attention so tokens can select relevant context in parallel, enabling efficient, scalable sequence modeling."}]}, + {"messages":[{"role":"user","content":"Convert 37 °C to Fahrenheit."},{"role":"assistant","content":"37 °C is 98.6 °F."}]}, + {"messages":[{"role":"user","content":"List three prime numbers under 10."},{"role":"assistant","content":"2, 3, and 5 (and 7 is a bonus)."}]}, + {"messages":[{"role":"user","content":"Explain Mixture-of-Experts in one sentence."},{"role":"assistant","content":"MoE routes each token through a small subset of specialized feed-forward experts, increasing capacity without proportional compute."}]} +] +with open('/workspace/data/minisft.jsonl','w') as f: + for r in rows: f.write(json.dumps(r, ensure_ascii=False) + "\n") +PY + +run: | + set -euxo pipefail + export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + + # 1) Convert Hugging Face weights -> Megatron-Core format (Qwen3 MoE) + # Smallest open Qwen3 MoE: Qwen3-30B-A3B (≈3B active params). + swift export \ + --model Qwen/Qwen3-30B-A3B-Instruct-2507 \ + --to_mcore true \ + --torch_dtype bfloat16 \ + --output_dir /workspace/qwen3-30b-a3b-mcore \ + --test_convert_precision true + + # 2) Single-GPU LoRA SFT with Megatron-SWIFT (memory-friendly knobs) + # LoRA + Flash attention + selective recompute on attention+MoE; SP on. 
+ megatron sft \ + --load /workspace/qwen3-30b-a3b-mcore \ + --dataset /workspace/data/minisft.jsonl \ + --train_type lora \ + --lora_rank 8 \ + --lora_alpha 16 \ + --target_modules all-linear \ + --attention_backend flash \ + --sequence_parallel true \ + --recompute_granularity selective \ + --recompute_modules core_attn moe \ + --micro_batch_size 2 \ + --global_batch_size 8 \ + --finetune true \ + --lr 1e-4 \ + --lr_warmup_fraction 0.05 \ + --min_lr 1e-5 \ + --max_epochs 1 \ + --save /workspace/output/qwen3-30b-a3b-lora \ + --save_interval 100000 \ + --max_length 1024 \ + --num_workers 2 \ + --no_save_optim true \ + --no_save_rng true \ + --log_throughput true + + # 3) Merge LoRA adapters -> HF format (for vLLM/Transformers/TGI) + LAST=$(ls -d /workspace/output/qwen3-30b-a3b-lora/v* | sort | tail -n1) + swift export \ + --mcore_adapters "$LAST" \ + --to_hf true \ + --torch_dtype bfloat16 \ + --output_dir /workspace/output/qwen3-30b-a3b-lora-hf \ + --test_convert_precision true + + # 4) Smoke test: load merged HF model and generate one answer + python3 - <<'PY' +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch, os +mdir="/workspace/output/qwen3-30b-a3b-lora-hf" +tok=AutoTokenizer.from_pretrained(mdir) +model=AutoModelForCausalLM.from_pretrained(mdir, torch_dtype=torch.bfloat16, device_map="auto") +prompt="User: In one sentence, what is LoRA?\nAssistant:" +x=tok(prompt, return_tensors="pt").to(model.device) +y=model.generate(**x, max_new_tokens=32, do_sample=False) +print(tok.decode(y[0], skip_special_tokens=True)) +PY + +config: + kubernetes: + provision_timeout: 900 \ No newline at end of file From 38559009fe16442a016caa0a474492102de95f61 Mon Sep 17 00:00:00 2001 From: Brad Hilton Date: Mon, 15 Sep 2025 18:37:45 -0600 Subject: [PATCH 2/6] refactor: Update config.yaml for improved setup and run scripts Enhanced the setup and run sections in config.yaml by adjusting indentation and ensuring proper execution of Python scripts for data generation and model testing. This improves readability and maintainability of the configuration. --- dev/ms-swift-megatron/config.yaml | 48 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/dev/ms-swift-megatron/config.yaml b/dev/ms-swift-megatron/config.yaml index 27bb0397..094dfdd2 100644 --- a/dev/ms-swift-megatron/config.yaml +++ b/dev/ms-swift-megatron/config.yaml @@ -19,7 +19,7 @@ envs: # HF_TOKEN: # Everything below runs *inside* the container on the K8s node. -setup: | +setup: |2 set -euxo pipefail mkdir -p /cache/hf /cache/ms /workspace/data /workspace/output @@ -47,20 +47,20 @@ setup: | # Tiny local SFT dataset to avoid external fetches; valid ms-swift "messages" format. 
python3 - <<'PY' -import json, os -os.makedirs('/workspace/data', exist_ok=True) -rows = [ - {"messages":[{"role":"system","content":"You are helpful."},{"role":"user","content":"Write a haiku about the ocean."},{"role":"assistant","content":"Waves count ancient time /\nSalt maps the moonlit harbor /\nGulls stitch dawn to foam."}]}, - {"messages":[{"role":"user","content":"Summarize: Transformers let tokens attend to each other."},{"role":"assistant","content":"Transformers use attention so tokens can select relevant context in parallel, enabling efficient, scalable sequence modeling."}]}, - {"messages":[{"role":"user","content":"Convert 37 °C to Fahrenheit."},{"role":"assistant","content":"37 °C is 98.6 °F."}]}, - {"messages":[{"role":"user","content":"List three prime numbers under 10."},{"role":"assistant","content":"2, 3, and 5 (and 7 is a bonus)."}]}, - {"messages":[{"role":"user","content":"Explain Mixture-of-Experts in one sentence."},{"role":"assistant","content":"MoE routes each token through a small subset of specialized feed-forward experts, increasing capacity without proportional compute."}]} -] -with open('/workspace/data/minisft.jsonl','w') as f: - for r in rows: f.write(json.dumps(r, ensure_ascii=False) + "\n") -PY + import json, os + os.makedirs('/workspace/data', exist_ok=True) + rows = [ + {"messages":[{"role":"system","content":"You are helpful."},{"role":"user","content":"Write a haiku about the ocean."},{"role":"assistant","content":"Waves count ancient time /\nSalt maps the moonlit harbor /\nGulls stitch dawn to foam."}]}, + {"messages":[{"role":"user","content":"Summarize: Transformers let tokens attend to each other."},{"role":"assistant","content":"Transformers use attention so tokens can select relevant context in parallel, enabling efficient, scalable sequence modeling."}]}, + {"messages":[{"role":"user","content":"Convert 37 °C to Fahrenheit."},{"role":"assistant","content":"37 °C is 98.6 °F."}]}, + {"messages":[{"role":"user","content":"List three prime numbers under 10."},{"role":"assistant","content":"2, 3, and 5 (and 7 is a bonus)."}]}, + {"messages":[{"role":"user","content":"Explain Mixture-of-Experts in one sentence."},{"role":"assistant","content":"MoE routes each token through a small subset of specialized feed-forward experts, increasing capacity without proportional compute."}]} + ] + with open('/workspace/data/minisft.jsonl','w') as f: + for r in rows: f.write(json.dumps(r, ensure_ascii=False) + "\n") + PY -run: | +run: |2 set -euxo pipefail export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True @@ -112,16 +112,16 @@ run: | # 4) Smoke test: load merged HF model and generate one answer python3 - <<'PY' -from transformers import AutoTokenizer, AutoModelForCausalLM -import torch, os -mdir="/workspace/output/qwen3-30b-a3b-lora-hf" -tok=AutoTokenizer.from_pretrained(mdir) -model=AutoModelForCausalLM.from_pretrained(mdir, torch_dtype=torch.bfloat16, device_map="auto") -prompt="User: In one sentence, what is LoRA?\nAssistant:" -x=tok(prompt, return_tensors="pt").to(model.device) -y=model.generate(**x, max_new_tokens=32, do_sample=False) -print(tok.decode(y[0], skip_special_tokens=True)) -PY + from transformers import AutoTokenizer, AutoModelForCausalLM + import torch, os + mdir="/workspace/output/qwen3-30b-a3b-lora-hf" + tok=AutoTokenizer.from_pretrained(mdir) + model=AutoModelForCausalLM.from_pretrained(mdir, torch_dtype=torch.bfloat16, device_map="auto") + prompt="User: In one sentence, what is LoRA?\nAssistant:" + x=tok(prompt, 
return_tensors="pt").to(model.device) + y=model.generate(**x, max_new_tokens=32, do_sample=False) + print(tok.decode(y[0], skip_special_tokens=True)) + PY config: kubernetes: From 9ec32779f03097645dfadfbfc845d7b7b6ec7eb7 Mon Sep 17 00:00:00 2001 From: Brad Hilton Date: Tue, 16 Sep 2025 09:44:57 -0600 Subject: [PATCH 3/6] chore: Update config.yaml and add Dockerfile for ms-swift-megatron Modified config.yaml to rename the model and adjust GPU resources. Added a new Dockerfile to set up the environment with necessary dependencies for ms-swift-megatron, including SSH server configuration and preinstalled packages for SkyPilot. --- dev/ms-swift-megatron/Dockerfile | 42 ++++++++++ dev/ms-swift-megatron/config.yaml | 123 +----------------------------- 2 files changed, 45 insertions(+), 120 deletions(-) create mode 100644 dev/ms-swift-megatron/Dockerfile diff --git a/dev/ms-swift-megatron/Dockerfile b/dev/ms-swift-megatron/Dockerfile new file mode 100644 index 00000000..8a359c93 --- /dev/null +++ b/dev/ms-swift-megatron/Dockerfile @@ -0,0 +1,42 @@ +# syntax=docker/dockerfile:1 +FROM modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.6.3-py311-torch2.7.1-vllm0.10.0-modelscope1.28.2-swift3.7.2 + +ENV DEBIAN_FRONTEND=noninteractive \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +# Core packages SkyPilot expects on the instance, plus distutils for gpustat build +RUN apt-get update && apt-get install -y --no-install-recommends \ + openssh-server \ + rsync \ + netcat-openbsd \ + pciutils \ + libpci3 \ + fuse3 \ + libfuse3-3 \ + libfuse2 \ + python3.10 \ + python3-pip \ + python3.10-venv \ + python3-distutils \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Ensure SSH server can start (SkyPilot uses SSH inside pods) +RUN mkdir -p /var/run/sshd && \ + sed -i 's/#\?PermitRootLogin .*/PermitRootLogin yes/' /etc/ssh/sshd_config || true + +# Preinstall Ray and SkyPilot deps into system Python 3.10 so /usr/local/bin/ray works +RUN /usr/bin/python3.10 -m pip install --upgrade "pip<25.1" "setuptools<70" && \ + /usr/bin/python3.10 -m pip install --no-cache-dir "gpustat==1.1.1" "ray[default]==2.9.3" "skypilot==0.10.3" + +# Also prepare the SkyPilot runtime venv to skip runtime setup where possible +RUN python3.10 -m venv /root/skypilot-runtime && \ + . /root/skypilot-runtime/bin/activate && \ + pip install --upgrade "pip<25.1" "setuptools<70" && \ + pip install --no-cache-dir "gpustat==1.1.1" "ray[default]==2.9.3" "skypilot==0.10.3" + +# Keep base image entrypoint/cmd + + diff --git a/dev/ms-swift-megatron/config.yaml b/dev/ms-swift-megatron/config.yaml index 094dfdd2..ea9a5caf 100644 --- a/dev/ms-swift-megatron/config.yaml +++ b/dev/ms-swift-megatron/config.yaml @@ -1,128 +1,11 @@ # config.yaml -name: qwen3-moe-lora-sft +name: ms-swift-megatron # Single node; single H200 on CoreWeave K8s num_nodes: 1 resources: infra: k8s - accelerators: H200:1 # uses the cluster's GPU label catalog + accelerators: H200:2 # uses the cluster's GPU label catalog cpus: 16+ memory: 64+ - # Use a CUDA 12.4 + Torch 2.6 base with swift preinstalled; we will upgrade swift in setup. 
- image_id: docker:modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py310-torch2.6.0-vllm0.8.5.post1-modelscope1.27.1-swift3.5.3 - -envs: - HF_HOME: /cache/hf - TRANSFORMERS_CACHE: /cache/hf - MODELSCOPE_CACHE: /cache/ms - # Uncomment if you need tokenized access: - # HF_TOKEN: - -# Everything below runs *inside* the container on the K8s node. -setup: |2 - set -euxo pipefail - mkdir -p /cache/hf /cache/ms /workspace/data /workspace/output - - # Build deps for Apex; keep lean. - apt-get update && apt-get install -y git build-essential ninja-build && rm -rf /var/lib/apt/lists/* - - # Upgrade ms-swift to get Megatron-SWIFT LoRA support; install deps per docs. - python3 -m pip install --upgrade --no-cache-dir pip - python3 -m pip install --no-cache-dir \ - "ms-swift>=3.8" transformers datasets accelerate pybind11 - - # TransformerEngine (recommended for fused kernels / Flash attention path). - python3 -m pip install --no-build-isolation "git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.3" - - # Apex (commit pinned from ms-swift docs for CUDA 12.4 compatibility). - git clone https://github.com/NVIDIA/apex /tmp/apex - cd /tmp/apex - git checkout e13873debc4699d39c6861074b9a3b2a02327f92 - python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \ - --config-settings "--build-option=--cpp_ext" \ - --config-settings "--build-option=--cuda_ext" . - - # Megatron-Core pinned version used by Megatron-SWIFT. - python3 -m pip install --no-cache-dir "git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.12.0" - - # Tiny local SFT dataset to avoid external fetches; valid ms-swift "messages" format. - python3 - <<'PY' - import json, os - os.makedirs('/workspace/data', exist_ok=True) - rows = [ - {"messages":[{"role":"system","content":"You are helpful."},{"role":"user","content":"Write a haiku about the ocean."},{"role":"assistant","content":"Waves count ancient time /\nSalt maps the moonlit harbor /\nGulls stitch dawn to foam."}]}, - {"messages":[{"role":"user","content":"Summarize: Transformers let tokens attend to each other."},{"role":"assistant","content":"Transformers use attention so tokens can select relevant context in parallel, enabling efficient, scalable sequence modeling."}]}, - {"messages":[{"role":"user","content":"Convert 37 °C to Fahrenheit."},{"role":"assistant","content":"37 °C is 98.6 °F."}]}, - {"messages":[{"role":"user","content":"List three prime numbers under 10."},{"role":"assistant","content":"2, 3, and 5 (and 7 is a bonus)."}]}, - {"messages":[{"role":"user","content":"Explain Mixture-of-Experts in one sentence."},{"role":"assistant","content":"MoE routes each token through a small subset of specialized feed-forward experts, increasing capacity without proportional compute."}]} - ] - with open('/workspace/data/minisft.jsonl','w') as f: - for r in rows: f.write(json.dumps(r, ensure_ascii=False) + "\n") - PY - -run: |2 - set -euxo pipefail - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True - - # 1) Convert Hugging Face weights -> Megatron-Core format (Qwen3 MoE) - # Smallest open Qwen3 MoE: Qwen3-30B-A3B (≈3B active params). - swift export \ - --model Qwen/Qwen3-30B-A3B-Instruct-2507 \ - --to_mcore true \ - --torch_dtype bfloat16 \ - --output_dir /workspace/qwen3-30b-a3b-mcore \ - --test_convert_precision true - - # 2) Single-GPU LoRA SFT with Megatron-SWIFT (memory-friendly knobs) - # LoRA + Flash attention + selective recompute on attention+MoE; SP on. 
- megatron sft \ - --load /workspace/qwen3-30b-a3b-mcore \ - --dataset /workspace/data/minisft.jsonl \ - --train_type lora \ - --lora_rank 8 \ - --lora_alpha 16 \ - --target_modules all-linear \ - --attention_backend flash \ - --sequence_parallel true \ - --recompute_granularity selective \ - --recompute_modules core_attn moe \ - --micro_batch_size 2 \ - --global_batch_size 8 \ - --finetune true \ - --lr 1e-4 \ - --lr_warmup_fraction 0.05 \ - --min_lr 1e-5 \ - --max_epochs 1 \ - --save /workspace/output/qwen3-30b-a3b-lora \ - --save_interval 100000 \ - --max_length 1024 \ - --num_workers 2 \ - --no_save_optim true \ - --no_save_rng true \ - --log_throughput true - - # 3) Merge LoRA adapters -> HF format (for vLLM/Transformers/TGI) - LAST=$(ls -d /workspace/output/qwen3-30b-a3b-lora/v* | sort | tail -n1) - swift export \ - --mcore_adapters "$LAST" \ - --to_hf true \ - --torch_dtype bfloat16 \ - --output_dir /workspace/output/qwen3-30b-a3b-lora-hf \ - --test_convert_precision true - - # 4) Smoke test: load merged HF model and generate one answer - python3 - <<'PY' - from transformers import AutoTokenizer, AutoModelForCausalLM - import torch, os - mdir="/workspace/output/qwen3-30b-a3b-lora-hf" - tok=AutoTokenizer.from_pretrained(mdir) - model=AutoModelForCausalLM.from_pretrained(mdir, torch_dtype=torch.bfloat16, device_map="auto") - prompt="User: In one sentence, what is LoRA?\nAssistant:" - x=tok(prompt, return_tensors="pt").to(model.device) - y=model.generate(**x, max_new_tokens=32, do_sample=False) - print(tok.decode(y[0], skip_special_tokens=True)) - PY - -config: - kubernetes: - provision_timeout: 900 \ No newline at end of file + image_id: docker:bradhiltonnw/ms-swift-megatron:skypilot From f3b9e03c5cd3a7e58b992fcf626db5eea2fd9b48 Mon Sep 17 00:00:00 2001 From: Brad Hilton Date: Tue, 16 Sep 2025 10:57:22 -0600 Subject: [PATCH 4/6] chore: Add ms-swift megatron scripts --- dev/ms-swift-megatron/to-hf.sh | 10 ++++++++ dev/ms-swift-megatron/to-mcore.sh | 10 ++++++++ dev/ms-swift-megatron/train.sh | 40 +++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100755 dev/ms-swift-megatron/to-hf.sh create mode 100755 dev/ms-swift-megatron/to-mcore.sh create mode 100755 dev/ms-swift-megatron/train.sh diff --git a/dev/ms-swift-megatron/to-hf.sh b/dev/ms-swift-megatron/to-hf.sh new file mode 100755 index 00000000..2d684d2f --- /dev/null +++ b/dev/ms-swift-megatron/to-hf.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0,1 \ +swift export \ + --mcore_adapters megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx \ + --to_hf true \ + --torch_dtype bfloat16 \ + --output_dir megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx-hf \ + --test_convert_precision true + \ No newline at end of file diff --git a/dev/ms-swift-megatron/to-mcore.sh b/dev/ms-swift-megatron/to-mcore.sh new file mode 100755 index 00000000..5a01a5de --- /dev/null +++ b/dev/ms-swift-megatron/to-mcore.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0,1 \ +swift export \ + --model Qwen/Qwen3-30B-A3B-Instruct-2507 \ + --to_mcore true \ + --torch_dtype bfloat16 \ + --output_dir Qwen3-30B-A3B-Instruct-2507-mcore \ + --test_convert_precision true + \ No newline at end of file diff --git a/dev/ms-swift-megatron/train.sh b/dev/ms-swift-megatron/train.sh new file mode 100755 index 00000000..18778eb1 --- /dev/null +++ b/dev/ms-swift-megatron/train.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' +export NPROC_PER_NODE=2 +export CUDA_VISIBLE_DEVICES=0,1 + +megatron 
sft \ + --load Qwen/Qwen3-30B-A3B-Instruct-2507-mcore \ + --dataset \ + 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ + 'AI-ModelScope/alpaca-gpt4-data-en#500' \ + 'swift/self-cognition#500' \ + --train_type lora \ + --lora_rank 8 \ + --lora_alpha 16 \ + --target_modules linear_qkv linear_proj \ + --tensor_model_parallel_size 2 \ + --sequence_parallel true \ + --micro_batch_size 16 \ + --global_batch_size 16 \ + --recompute_granularity full \ + --recompute_method uniform \ + --recompute_num_layers 1 \ + --finetune true \ + --cross_entropy_loss_fusion true \ + --lr 1e-4 \ + --lr_warmup_fraction 0.05 \ + --min_lr 1e-5 \ + --max_epochs 1 \ + --save megatron_output/Qwen3-30B-A3B-Instruct-2507 \ + --save_interval 100 \ + --max_length 2048 \ + --system 'You are a helpful assistant.' \ + --num_workers 4 \ + --no_save_optim true \ + --no_save_rng true \ + --dataset_num_proc 4 \ + --model_author swift \ + --model_name swift-robot + \ No newline at end of file From 7d4ed0ec2c2cb95c9601e4ed59b4b0e9fce50180 Mon Sep 17 00:00:00 2001 From: Brad Hilton Date: Wed, 17 Sep 2025 03:00:32 +0800 Subject: [PATCH 5/6] fix: Correct model loading path in train.sh script --- dev/ms-swift-megatron/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/ms-swift-megatron/train.sh b/dev/ms-swift-megatron/train.sh index 18778eb1..7c6a661d 100755 --- a/dev/ms-swift-megatron/train.sh +++ b/dev/ms-swift-megatron/train.sh @@ -5,7 +5,7 @@ export NPROC_PER_NODE=2 export CUDA_VISIBLE_DEVICES=0,1 megatron sft \ - --load Qwen/Qwen3-30B-A3B-Instruct-2507-mcore \ + --load Qwen3-30B-A3B-Instruct-2507-mcore \ --dataset \ 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ 'AI-ModelScope/alpaca-gpt4-data-en#500' \ From 9fee40e9ff7daac71fde6579b3ba189a2dd79686 Mon Sep 17 00:00:00 2001 From: Brad Hilton Date: Wed, 17 Sep 2025 07:54:19 +0800 Subject: [PATCH 6/6] chore: Update model paths and GPU configuration in megatron scripts Modified to-hf.sh, to-mcore.sh, and train.sh to use the new model version Qwen3-235B-A22B-Instruct-2507 and expanded CUDA_VISIBLE_DEVICES to include more GPUs. Adjusted dataset and training parameters in train.sh for improved performance. 
--- dev/ms-swift-megatron/to-hf.sh | 6 ++--- dev/ms-swift-megatron/to-mcore.sh | 6 ++--- dev/ms-swift-megatron/train.sh | 38 ++++++++++++++++++------------- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/dev/ms-swift-megatron/to-hf.sh b/dev/ms-swift-megatron/to-hf.sh index 2d684d2f..51bafae3 100755 --- a/dev/ms-swift-megatron/to-hf.sh +++ b/dev/ms-swift-megatron/to-hf.sh @@ -1,10 +1,10 @@ #!/bin/bash -CUDA_VISIBLE_DEVICES=0,1 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ swift export \ - --mcore_adapters megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx \ + --mcore_adapters megatron_output/Qwen3-235B-A22B-Instruct-2507/vx-xxx \ --to_hf true \ --torch_dtype bfloat16 \ - --output_dir megatron_output/Qwen3-30B-A3B-Instruct-2507/vx-xxx-hf \ + --output_dir megatron_output/Qwen3-235B-A22B-Instruct-2507/vx-xxx-hf \ --test_convert_precision true \ No newline at end of file diff --git a/dev/ms-swift-megatron/to-mcore.sh b/dev/ms-swift-megatron/to-mcore.sh index 5a01a5de..89eeec8e 100755 --- a/dev/ms-swift-megatron/to-mcore.sh +++ b/dev/ms-swift-megatron/to-mcore.sh @@ -1,10 +1,10 @@ #!/bin/bash -CUDA_VISIBLE_DEVICES=0,1 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ swift export \ - --model Qwen/Qwen3-30B-A3B-Instruct-2507 \ + --model Qwen/Qwen3-235B-A22B-Instruct-2507 \ --to_mcore true \ --torch_dtype bfloat16 \ - --output_dir Qwen3-30B-A3B-Instruct-2507-mcore \ + --output_dir Qwen3-235B-A22B-Instruct-2507-mcore \ --test_convert_precision true \ No newline at end of file diff --git a/dev/ms-swift-megatron/train.sh b/dev/ms-swift-megatron/train.sh index 7c6a661d..eaf2e4ce 100755 --- a/dev/ms-swift-megatron/train.sh +++ b/dev/ms-swift-megatron/train.sh @@ -5,36 +5,42 @@ export NPROC_PER_NODE=2 export CUDA_VISIBLE_DEVICES=0,1 megatron sft \ - --load Qwen3-30B-A3B-Instruct-2507-mcore \ - --dataset \ - 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ - 'AI-ModelScope/alpaca-gpt4-data-en#500' \ - 'swift/self-cognition#500' \ + --load Qwen3-235B-A22B-Instruct-2507-mcore \ + --dataset 'swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT#2000' \ + 'swift/self-cognition#1000' \ --train_type lora \ --lora_rank 8 \ - --lora_alpha 16 \ - --target_modules linear_qkv linear_proj \ - --tensor_model_parallel_size 2 \ - --sequence_parallel true \ - --micro_batch_size 16 \ + --lora_alpha 32 \ + --target_modules all-linear \ + --split_dataset_ratio 0.01 \ + --moe_permute_fusion true \ + --tensor_model_parallel_size 4 \ + --expert_tensor_parallel_size 1 \ + --expert_model_parallel_size 8 \ + --moe_grouped_gemm true \ + --moe_shared_expert_overlap true \ + --moe_aux_loss_coeff 1e-3 \ + --micro_batch_size 8 \ --global_batch_size 16 \ --recompute_granularity full \ --recompute_method uniform \ --recompute_num_layers 1 \ + --max_epochs 1 \ --finetune true \ --cross_entropy_loss_fusion true \ --lr 1e-4 \ --lr_warmup_fraction 0.05 \ --min_lr 1e-5 \ - --max_epochs 1 \ - --save megatron_output/Qwen3-30B-A3B-Instruct-2507 \ - --save_interval 100 \ + --save megatron_output/Qwen3-235B-A22B-Instruct-2507 \ + --eval_interval 200 \ + --save_interval 200 \ --max_length 2048 \ - --system 'You are a helpful assistant.' \ - --num_workers 4 \ + --num_workers 8 \ + --dataset_num_proc 8 \ --no_save_optim true \ --no_save_rng true \ - --dataset_num_proc 4 \ + --sequence_parallel true \ + --attention_backend flash \ --model_author swift \ --model_name swift-robot \ No newline at end of file
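
Note on the parallelism knobs introduced in the last patch: --tensor_model_parallel_size 4, --expert_tensor_parallel_size 1, and --expert_model_parallel_size 8 determine the launcher world size. Megatron requires the number of ranks to be divisible by the tensor-parallel size, and with EP=8 the expert layers are sharded across eight ranks, so the Qwen3-235B-A22B LoRA run effectively needs all eight GPUs on the node. train.sh, however, still exports NPROC_PER_NODE=2 and CUDA_VISIBLE_DEVICES=0,1 from the earlier two-GPU 30B setup. Below is a minimal, hypothetical sketch of the exports this configuration appears to assume; it is not part of the patches themselves (to-mcore.sh and to-hf.sh already use the full device set after patch 6):

#!/bin/bash
# Hypothetical prologue for the Qwen3-235B-A22B LoRA run on a single 8-GPU node.
# Assumption: world size = NPROC_PER_NODE on this one node; it must be divisible
# by tensor_model_parallel_size (4) and large enough for expert_model_parallel_size (8).
export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
export NPROC_PER_NODE=8
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ...followed by the `megatron sft` invocation from train.sh, unchanged.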