Changes from all commits (25 commits):
- `761a335` make provisioner an actor (DNXie, Oct 17, 2025)
- `1ac4bfa` change monarch commit to main (DNXie, Oct 20, 2025)
- `b3a18ec` Enable MAST client mode (#405) (allenwang28, Oct 16, 2025)
- `a93e135` Fix: Sync `ref_model`'s `seq_len` with trainer configuration in grpo … (wukaixingxp, Oct 16, 2025)
- `54beecd` Update Tutorial Docs: PolicyActor -> Policy Actor (#436) (Jack-Khuu, Oct 16, 2025)
- `dd79119` Add warning for existing build/ directory in env setup (#441) (casteryh, Oct 16, 2025)
- `fd208cc` Add FORGE_DISABLE_METRICS check when init_backend (#440) (DNXie, Oct 17, 2025)
- `299a9ee` fix broken qwen3_8b config (#450) (casteryh, Oct 17, 2025)
- `6b317bb` Remove chat environment (#443) (Jack-Khuu, Oct 17, 2025)
- `51b8db4` Remove unused CLI setup related files (#442) (Jack-Khuu, Oct 17, 2025)
- `c39991f` Remove unused interfaces and types (#446) (Jack-Khuu, Oct 17, 2025)
- `9499c29` clean up torchtune (#452) (DNXie, Oct 17, 2025)
- `e2ce0cf` Make checkpoint saving `folder` clear in the config (#444) (DNXie, Oct 17, 2025)
- `ac871ca` Move config into utils (#453) (Jack-Khuu, Oct 17, 2025)
- `c4fac79` Change Forge to TorchForge in README (#454) (allenwang28, Oct 17, 2025)
- `d7e9f3e` typo: Fixing missed rename in vllm_policy test (#457) (Jack-Khuu, Oct 17, 2025)
- `84fd26e` GRPO Jupyter Notebook (#458) (pbontrager, Oct 17, 2025)
- `79145e8` fix - Metric logging work with new monarch API (#451) (felipemello1, Oct 17, 2025)
- `d07e599` wandb hang fix - add timeout (#460) (felipemello1, Oct 17, 2025)
- `36dc45c` Enable RDMA by default (#461) (allenwang28, Oct 17, 2025)
- `c18d610` Normalize 'TorchForge' to 'torchforge' in README (#462) (jspisak, Oct 18, 2025)
- `7d8ee61` shared memory multiprocess prefetch for weight update (#430) (casteryh, Oct 18, 2025)
- `cd9fc37` Updating outdated tests/README (#459) (Jack-Khuu, Oct 19, 2025)
- `bce693e` enable vLLM upload with CUDA 12.8 build (#465) (ebsmothers, Oct 20, 2025)
- `7b656be` make provisioner an actor (DNXie, Oct 17, 2025)
71 changes: 35 additions & 36 deletions .github/packaging/vllm_reqs.txt
@@ -7,20 +7,20 @@
# See the file .github/workflows/gpu_test.yaml for an E2E forge installation using this approach.
# TODO: this should be done way less hackily
aiohappyeyeballs==2.6.1
-aiohttp==3.13.0
+aiohttp==3.13.1
aiosignal==1.4.0
annotated-types==0.7.0
anyio==4.11.0
astor==0.8.1
async-timeout==5.0.1
attrs==25.4.0
-blake3==1.0.7
-cachetools==6.2.0
+blake3==1.0.8
+cachetools==6.2.1
cbor2==5.7.0
certifi==2025.10.5
cffi==2.0.0
-charset-normalizer==3.4.3
-click==8.3.0
+charset-normalizer==3.4.4
+click==8.2.1
cloudpickle==3.1.1
cmake==4.1.0
compressed-tensors==0.10.2
@@ -33,7 +33,7 @@ dnspython==2.8.0
einops==0.8.1
email-validator==2.3.0
exceptiongroup==1.3.0
-fastapi==0.118.3
+fastapi==0.119.0
fastapi-cli==0.0.13
fastapi-cloud-cli==0.3.1
fastrlock==0.8.3
@@ -47,81 +47,80 @@ httpcore==1.0.9
httptools==0.7.1
httpx==0.28.1
huggingface-hub==0.35.3
-idna==3.10
+idna==3.11
interegular==0.3.3
Jinja2==3.1.6
-jiter==0.11.0
+jiter==0.11.1
jsonschema==4.25.1
jsonschema-specifications==2025.9.1
lark==1.2.2
llguidance==0.7.30
llvmlite==0.44.0
lm-format-enforcer==0.10.12
markdown-it-py==4.0.0
-MarkupSafe==3.0.2
+MarkupSafe==2.1.5
mdurl==0.1.2
mistral_common==1.8.5
mpmath==1.3.0
msgpack==1.1.2
msgspec==0.19.0
multidict==6.7.0
-networkx==3.4.2
+networkx==3.3
ninja==1.13.0
numba==0.61.2
numpy==2.2.6
-nvidia-cublas-cu12==12.9.1.4
-nvidia-cuda-cupti-cu12==12.9.79
-nvidia-cuda-nvrtc-cu12==12.9.86
-nvidia-cuda-runtime-cu12==12.9.79
+nvidia-cublas-cu12==12.8.4.1
+nvidia-cuda-cupti-cu12==12.8.90
+nvidia-cuda-nvrtc-cu12==12.8.93
+nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
-nvidia-cufft-cu12==11.4.1.4
-nvidia-cufile-cu12==1.14.1.1
-nvidia-curand-cu12==10.3.10.19
-nvidia-cusolver-cu12==11.7.5.82
-nvidia-cusparse-cu12==12.5.10.65
+nvidia-cufft-cu12==11.3.3.83
+nvidia-cufile-cu12==1.13.1.3
+nvidia-curand-cu12==10.3.9.90
+nvidia-cusolver-cu12==11.7.3.90
+nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-nccl-cu12==2.27.5
-nvidia-nvjitlink-cu12==12.9.86
+nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
-nvidia-nvtx-cu12==12.9.79
+nvidia-nvtx-cu12==12.8.90
openai==1.90.0
opencv-python-headless==4.12.0.88
outlines_core==0.2.10
packaging==25.0
partial-json-parser==0.2.1.1.post6
-pillow==11.3.0
+pillow==12.0.0
prometheus-fastapi-instrumentator==7.1.0
prometheus_client==0.23.1
propcache==0.4.1
-protobuf==6.32.1
+protobuf==6.33.0
psutil==7.1.0
py-cpuinfo==9.0.0
pybase64==1.4.2
pycountry==24.6.1
pycparser==2.23
-pydantic==2.12.0
+pydantic==2.12.3
pydantic-extra-types==2.10.6
-pydantic_core==2.41.1
+pydantic_core==2.41.4
Pygments==2.19.2
python-dotenv==1.1.1
python-json-logger==4.0.0
python-multipart==0.0.20
-pytorch-triton==3.4.0+gitf7888497
PyYAML==6.0.3
pyzmq==27.1.0
-ray==2.49.2
-referencing==0.36.2
+ray==2.50.0
+referencing==0.37.0
regex==2025.9.18
requests==2.32.5
rich==14.2.0
rich-toolkit==0.15.1
-rignore==0.7.0
+rignore==0.7.1
rpds-py==0.27.1
safetensors==0.6.2
scipy==1.15.3
sentencepiece==0.2.1
-sentry-sdk==2.41.0
-setuptools-scm==9.2.0
+sentry-sdk==2.42.0
+setuptools-scm==9.2.1
shellingham==1.5.4
sniffio==1.3.1
soundfile==0.13.1
@@ -131,17 +130,17 @@ sympy==1.14.0
tiktoken==0.12.0
tokenizers==0.22.1
tomli==2.3.0
-torch==2.9.0.dev20250905+cu129
+torch==2.9.0+cu128
tqdm==4.67.1
-transformers==4.57.0
-triton==3.4.0
+transformers==4.57.1
+triton==3.5.0
typer==0.19.2
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.5.0
uvicorn==0.37.0
-uvloop==0.21.0
-watchfiles==1.1.0
+uvloop==0.22.1
+watchfiles==1.1.1
websockets==15.0.1
xgrammar==0.1.21
yarl==1.22.0
16 changes: 8 additions & 8 deletions .github/workflows/build_vllm.yaml
@@ -12,15 +12,15 @@ permissions:

jobs:
  build:
-    name: forge-cu129-nightly
-    uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
+    name: forge-cu128-nightly
+    uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@vllm-push
    strategy:
      fail-fast: false
    with:
      repository: meta-pytorch/forge
      ref: ""
      test-infra-repository: pytorch/test-infra
-      test-infra-ref: main
+      test-infra-ref: vllm-push
      run-smoke-test: false
      wheel-nightly-policy: gha_workflow_preview_build_wheels
      wheel-upload-path: whl/preview/forge/
@@ -31,13 +31,13 @@ jobs:
          {
            "python_version": "3.10",
            "gpu_arch_type": "cpu",
-            "gpu_arch_version": "12.9",
-            "desired_cuda": "cu129",
-            "container_image": "pytorch/manylinux2_28-builder:cuda12.9",
+            "gpu_arch_version": "12.8",
+            "desired_cuda": "cu128",
+            "container_image": "pytorch/manylinux2_28-builder:cuda12.8",
            "package_type": "manywheel",
-            "build_name": "manywheel-py3_10-cuda12_9",
+            "build_name": "manywheel-py3_10-cuda12_8",
            "validation_runner": "linux.12xlarge.memory",
-            "installation": "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129",
+            "installation": "pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128",
            "channel": "nightly",
            "upload_to_base_bucket": "no",
            "stable_version": "2.8.0",
81 changes: 80 additions & 1 deletion .meta/mast/README.md
@@ -21,7 +21,7 @@ The `env_setup.sh` script will automatically:
chmod +x .meta/mast/env_setup.sh

# Run the setup
-./.meta/mast/env_setup.sh
+source .meta/mast/env_setup.sh

```

@@ -44,3 +44,82 @@ The launch script will automatically:
- Launch the MAST job with the specified config

You can run it from anywhere, and it will figure out the correct paths.


## How MAST Launcher Works

The MAST launcher uses a two-stage architecture to run training jobs:

### Stage 1: Detached Mode (Local Machine)

When you run `./.meta/mast/launch.sh`, the `main.py` script starts in **detached mode**:

1. The launcher creates a MAST job with all the worker roles (GPU hosts)
2. It also creates a special **client role** - a CPU-only role that will run inside MAST
3. The client role's entrypoint is set to `client_bootstrap.sh`
4. All CLI arguments you pass are forwarded to the client role

At this point, the job is submitted to MAST and your local script exits. Everything now runs in the cluster.

### Stage 2: Remote Mode (Inside MAST)

The `client_bootstrap.sh` script runs inside the MAST client role and:

1. Calls `main.py` again, but now with `--mode=remote`
2. In **remote mode**, the script:
- Mounts the OilFS workspace
- Initializes the provisioner to connect to worker roles
- Runs the actual training workload (e.g., GRPO)

This architecture allows the entire training workflow to run inside MAST without requiring a persistent connection from your local machine.
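
Schematically, the two modes can be pictured as a single entrypoint that dispatches on the mode flag. The sketch below is illustrative only: the real logic lives in `.meta/mast/main.py` and `launcher.py`, and the function names (`launch_detached`, `run_remote`) and the default mode are hypothetical.

```python
# Illustrative sketch of main.py's two-mode dispatch; names are hypothetical.
import argparse


def launch_detached(forwarded_args):
    # Stage 1 (local): build the MAST job spec with the GPU worker roles plus
    # a CPU-only client role whose entrypoint is client_bootstrap.sh, forward
    # the CLI arguments to that role, submit the job, then exit.
    ...


def run_remote(forwarded_args):
    # Stage 2 (inside MAST): mount the OilFS workspace, initialize the
    # provisioner so it can reach the worker roles, then run the training
    # workload (e.g. GRPO).
    ...


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=["detached", "remote"], default="detached")
    args, forwarded = parser.parse_known_args()
    if args.mode == "remote":
        run_remote(forwarded)
    else:
        launch_detached(forwarded)


if __name__ == "__main__":
    main()
```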

### Key Files

- **`main.py`**: Entry point that handles both detached and remote modes
- **`client_bootstrap.sh`**: Entrypoint for the client role in MAST
- **`launcher.py`**: Creates the MAST job specification and handles role configuration


## Managing HuggingFace Models in MAST

### The Problem: No Internet Access

MAST compute nodes cannot access the internet, which means they cannot download models directly from HuggingFace. To work around this, we store all HuggingFace models and cache data on OilFS at `/mnt/wsfuse/teamforge/hf`, which is accessible from MAST.

### Solution: Two-Step Process

You need to perform both steps below to ensure models work correctly in MAST:

#### 1. Download Model Weights to OilFS

First, download the model weights directly to the OilFS path. This should be done from a machine with internet access (like your devserver):

```bash
# Set HF_HOME to the OilFS path
export HF_HOME=/mnt/wsfuse/teamforge/hf

# Download the model (replace with your desired model)
huggingface-cli download Qwen/Qwen3-8B --local-dir /mnt/wsfuse/teamforge/hf_artifacts/qwen3_8b
```

#### 2. Hydrate the HuggingFace Cache

After downloading the weights, you need to hydrate the HuggingFace cache so that the transformers library can find the model metadata:

```bash
# Set HF_HOME to the OilFS path
export HF_HOME=/mnt/wsfuse/teamforge/hf

# Hydrate the cache for the model
python .meta/mast/hydrate_cache.py --model-id Qwen/Qwen3-8B
```

This ensures that when MAST runs with `HF_HUB_OFFLINE=1`, the transformers library can locate all necessary files from the cache.
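
To sanity-check a hydrated cache, you can attempt a fully offline load from any machine that mounts OilFS. This is a minimal sketch assuming the paths above and an environment with `transformers` installed:

```python
# Minimal offline-load check; assumes the OilFS paths described above.
import os

# Set these before importing transformers so the cache location is picked up.
os.environ["HF_HOME"] = "/mnt/wsfuse/teamforge/hf"
os.environ["HF_HUB_OFFLINE"] = "1"

from transformers import AutoConfig, AutoTokenizer

# Both calls must resolve entirely from the local cache; an error here means
# the cache was not hydrated for this model.
config = AutoConfig.from_pretrained("Qwen/Qwen3-8B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
print("offline load OK:", config.model_type, type(tokenizer).__name__)
```

If this fails with a file-not-found-in-cache style error, re-run the hydration step for that model ID.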

### Directory Structure

Both cache and model files are stored under:
- **Cache**: `/mnt/wsfuse/teamforge/hf` (set via `HF_HOME`)
- **Model weights**: `/mnt/wsfuse/teamforge/hf/<model_name>`

Make sure your MAST config files point to the correct paths in `hf_artifacts`.
51 changes: 51 additions & 0 deletions .meta/mast/client_bootstrap.sh
@@ -0,0 +1,51 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Bootstrap script for the MAST client role
# This script sets up the environment and launches the client training script

set -eEx

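# Preload the fbcode platform CUDA driver libraries (when present) so that
# Triton and other CUDA consumers can resolve libcuda/libnvidia-ml at runtime.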
LIBCUDA="/usr/local/fbcode/platform010/lib/libcuda.so"
if [ -f "$LIBCUDA" ]; then
    export LIBCUDA_DIR="${LIBCUDA%/*}"
    export TRITON_LIBCUDA_PATH="$LIBCUDA_DIR"
    export LD_PRELOAD="$LIBCUDA:/usr/local/fbcode/platform010/lib/libnvidia-ml.so${PRELOAD_PATH:+:$PRELOAD_PATH}"
fi

# Also prepend the torch library paths: in the monarch dev workflow we don't
# install torch into the env, so the binaries must be able to find libtorch
# and friends on MAST, and the rpaths set during a dev install will be wrong
# there.
export LD_LIBRARY_PATH="${CONDA_DIR}/lib:${CONDA_DIR}/lib/python3.10/site-packages/torch/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$TORCHX_RUN_PYTHONPATH"

# shellcheck disable=SC1091
if [ -n "$CONDA_PREFIX" ]; then
    echo "A conda environment is already activated: $CONDA_DEFAULT_ENV"
else
    # Disable command printing to avoid log spew.
    set +x
    source "${CONDA_DIR}/bin/activate"
    # Re-enable command printing after conda activation.
    set -x
fi

if [ -z "$WORKSPACE_DIR" ] || [ ! -d "$WORKSPACE_DIR" ]; then
    WORKSPACE_DIR="$CONDA_PREFIX"
fi

cd "$WORKSPACE_DIR/forge"

export WANDB_MODE=offline
export HF_HUB_OFFLINE=1
export MONARCH_HOST_MESH_V1_REMOVE_ME_BEFORE_RELEASE=1
export TORCHSTORE_RDMA_ENABLED=1
export HF_HOME=/mnt/wsfuse/teamforge/hf

# Execute the client training script with all passed arguments
exec python -X faulthandler .meta/mast/main.py "$@"