Skip to content

Commit 06bbcd2

Browse files
authored
[vlm] Add light-weight CI for experimental models (#1848)
Add one light-weight CI for VLM
1 parent a82b77a commit 06bbcd2

File tree

10 files changed

+136
-5
lines changed

10 files changed

+136
-5
lines changed

.ci/docker/common/install_conda.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ install_pip_dependencies() {
4242
pip_install -r /opt/conda/requirements-dev.txt
4343
pip_install -r /opt/conda/requirements.txt
4444
pip_install -r /opt/conda/requirements-flux.txt
45+
pip_install -r /opt/conda/requirements-vlm.txt
4546
popd
4647
}
4748

File renamed without changes.

.ci/docker/ubuntu/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,11 @@ ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH
3232
COPY requirements-dev.txt /opt/conda/
3333
COPY requirements.txt /opt/conda/
3434
COPY requirements-flux.txt /opt/conda/
35+
COPY requirements-vlm.txt /opt/conda/
3536
COPY conda-env-ci.txt /opt/conda/
3637
COPY ./common/install_conda.sh install_conda.sh
3738
COPY ./common/utils.sh utils.sh
38-
RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt
39+
RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/requirements-vlm.txt /opt/conda/conda-env-ci.txt
3940

4041
USER ci-user
4142
CMD ["bash"]

.github/workflows/integration_test_8gpu_flux.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@ on:
88
pull_request:
99
paths:
1010
- 'torchtitan/experiments/flux/**'
11-
schedule:
12-
# Runs every 6 hours
13-
- cron: '0 */6 * * *'
11+
1412
concurrency:
1513
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
1614
cancel-in-progress: true

.github/workflows/integration_test_8gpu_models.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ on:
66
paths-ignore:
77
- 'torchtitan/experiments/**'
88
pull_request:
9+
branches: [ main ]
910
paths-ignore:
1011
- 'torchtitan/experiments/**'
1112
schedule:
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
name: 8 GPU Vision Language Model Tests
2+
3+
on:
4+
push:
5+
branches: [ main ]
6+
paths:
7+
- 'torchtitan/experiments/vlm/**'
8+
pull_request:
9+
paths:
10+
- 'torchtitan/experiments/vlm/**'
11+
12+
concurrency:
13+
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
14+
cancel-in-progress: true
15+
16+
defaults:
17+
run:
18+
shell: bash -l -eo pipefail {0}
19+
20+
jobs:
21+
build-test:
22+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
23+
with:
24+
runner: linux.g5.48xlarge.nvidia.gpu
25+
gpu-arch-type: cuda
26+
gpu-arch-version: "12.6"
27+
# This image is faster to clone than the default, but it lacks CC needed by triton
28+
# (1m25s vs 2m37s).
29+
docker-image: torchtitan-ubuntu-20.04-clang12
30+
repository: pytorch/torchtitan
31+
upload-artifact: outputs
32+
script: |
33+
set -eux
34+
35+
# The generic Linux job chooses to use the base env, not the one set up by the image
36+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
37+
conda activate "${CONDA_ENV}"
38+
39+
# Log CUDA driver version for debugging.
40+
DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
41+
echo "CUDA driver version: ${DRIVER_VERSION}"
42+
43+
pip config --user set global.progress_bar off
44+
45+
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
46+
47+
USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
48+
49+
mkdir artifacts-to-be-uploaded
50+
python -m torchtitan.experiments.vlm.tests.integration_tests artifacts-to-be-uploaded --ngpu 4

tests/integration_tests/run_tests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def main():
106106
"--test_suite",
107107
default="features",
108108
choices=["features", "models", "h100"],
109-
help="Which test suite to run. If not specified, torchtitan composibility tests will be run",
109+
help="Which test suite to run. If not specified, torchtitan composability tests will be run",
110110
)
111111
parser.add_argument(
112112
"--config_path",

torchtitan/experiments/vlm/datasets/mm_datasets.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,14 @@ def _process_cc12_wd_sample(
199199
loader=lambda path: load_dataset(path, split="train", streaming=True),
200200
sample_processor=_process_cc12_wd_sample,
201201
),
202+
"cc12m-test": DatasetConfig(
203+
# TODO: move test cc12m dataset to core test folder
204+
path="torchtitan/experiments/flux/tests/assets/cc12m_test",
205+
loader=lambda path: load_dataset(
206+
path, split="train", data_files={"train": "*.tar"}, streaming=True
207+
),
208+
sample_processor=_process_cc12_wd_sample,
209+
),
202210
}
203211

204212

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../.ci/docker/requirements-vlm.txt
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import argparse
8+
import os
9+
10+
from tests.integration_tests import OverrideDefinitions
11+
from tests.integration_tests.run_tests import run_tests
12+
13+
14+
def build_vlm_test_list() -> list[OverrideDefinitions]:
15+
"""
16+
Returns a list of OverrideDefinitions
17+
that is used to generate variations of integration tests based on the
18+
same root config file.
19+
"""
20+
integration_tests_flavors = [
21+
OverrideDefinitions(
22+
[
23+
[
24+
"--experimental.custom_args_module torchtitan.experiments.vlm.assets.job_config",
25+
"--model.name vlm",
26+
"--training.dataset cc12m-test",
27+
"--parallelism.data_parallel_shard_degree 4",
28+
"--data.max_patches_per_image 1024",
29+
"--data.max_images_per_batch 64",
30+
],
31+
],
32+
"VLM FSDP",
33+
"vlm_fsdp",
34+
ngpu=4,
35+
),
36+
]
37+
return integration_tests_flavors
38+
39+
40+
_TEST_SUITES_FUNCTION = {
41+
"vlm": build_vlm_test_list,
42+
}
43+
44+
45+
def main():
46+
parser = argparse.ArgumentParser()
47+
parser.add_argument("output_dir")
48+
parser.add_argument(
49+
"--config_path",
50+
default="./tests/integration_tests/base_config.toml",
51+
help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
52+
)
53+
parser.add_argument(
54+
"--test_name",
55+
default="all",
56+
help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
57+
)
58+
parser.add_argument("--ngpu", default=8, type=int)
59+
args = parser.parse_args()
60+
61+
if not os.path.exists(args.output_dir):
62+
os.makedirs(args.output_dir)
63+
if os.listdir(args.output_dir):
64+
raise RuntimeError("Please provide an empty output directory.")
65+
66+
test_list = _TEST_SUITES_FUNCTION["vlm"]()
67+
run_tests(args, test_list)
68+
69+
70+
if __name__ == "__main__":
71+
main()

0 commit comments

Comments
 (0)