[vlm] Add light-weight CI for experimental models (#1848)

wwwjn · web-flow · commit 06bbcd20a615 · 2025-10-12T13:32:27.000-07:00
Add one light-weight CI for VLM
diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh
@@ -42,6 +42,7 @@ install_pip_dependencies() {
   pip_install -r /opt/conda/requirements-dev.txt
   pip_install -r /opt/conda/requirements.txt
   pip_install -r /opt/conda/requirements-flux.txt
+  pip_install -r /opt/conda/requirements-vlm.txt
   popd
 }
 
diff --git a/.ci/docker/requirements-vlm.txt b/.ci/docker/requirements-vlm.txt
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
@@ -32,10 +32,11 @@ ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 COPY requirements-dev.txt /opt/conda/
 COPY requirements.txt /opt/conda/
 COPY requirements-flux.txt /opt/conda/
+COPY requirements-vlm.txt /opt/conda/
 COPY conda-env-ci.txt /opt/conda/
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/utils.sh utils.sh
-RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt
+RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/requirements-vlm.txt /opt/conda/conda-env-ci.txt
 
 USER ci-user
 CMD ["bash"]
diff --git a/.github/workflows/integration_test_8gpu_flux.yaml b/.github/workflows/integration_test_8gpu_flux.yaml
@@ -8,9 +8,7 @@ on:
   pull_request:
     paths:
       - 'torchtitan/experiments/flux/**'
-  schedule:
-    # Runs every 6 hours
-    - cron: '0 */6 * * *'
+
 concurrency:
   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml
@@ -6,6 +6,7 @@ on:
     paths-ignore:
       - 'torchtitan/experiments/**'
   pull_request:
+    branches: [ main ]
     paths-ignore:
       - 'torchtitan/experiments/**'
   schedule:
diff --git a/.github/workflows/integration_test_8gpu_vlm.yaml b/.github/workflows/integration_test_8gpu_vlm.yaml
@@ -0,0 +1,50 @@
+name: 8 GPU Vision Language Model Tests
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'torchtitan/experiments/vlm/**'
+  pull_request:
+    paths:
+      - 'torchtitan/experiments/vlm/**'
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.48xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default (1m25s vs 2m37s), but it lacks
+      # the CC needed by triton.
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use the base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Log CUDA driver version for debugging.
+        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
+        echo "CUDA driver version: ${DRIVER_VERSION}"
+
+        pip config --user set global.progress_bar off
+
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        mkdir artifacts-to-be-uploaded
+        python -m torchtitan.experiments.vlm.tests.integration_tests artifacts-to-be-uploaded --ngpu 4
diff --git a/tests/integration_tests/run_tests.py b/tests/integration_tests/run_tests.py
@@ -106,7 +106,7 @@ def main():
         "--test_suite",
         default="features",
         choices=["features", "models", "h100"],
-        help="Which test suite to run. If not specified, torchtitan composibility tests will be run",
+        help="Which test suite to run. If not specified, torchtitan composability tests will be run",
     )
     parser.add_argument(
         "--config_path",
diff --git a/torchtitan/experiments/vlm/datasets/mm_datasets.py b/torchtitan/experiments/vlm/datasets/mm_datasets.py
@@ -199,6 +199,14 @@ def _process_cc12_wd_sample(
         loader=lambda path: load_dataset(path, split="train", streaming=True),
         sample_processor=_process_cc12_wd_sample,
     ),
+    "cc12m-test": DatasetConfig(
+        # TODO: move test cc12m dataset to core test folder
+        path="torchtitan/experiments/flux/tests/assets/cc12m_test",
+        loader=lambda path: load_dataset(
+            path, split="train", data_files={"train": "*.tar"}, streaming=True
+        ),
+        sample_processor=_process_cc12_wd_sample,
+    ),
 }
 
 
diff --git a/torchtitan/experiments/vlm/requirements-vlm.txt b/torchtitan/experiments/vlm/requirements-vlm.txt
@@ -0,0 +1 @@
+../../../.ci/docker/requirements-vlm.txt
diff --git a/torchtitan/experiments/vlm/tests/integration_tests.py b/torchtitan/experiments/vlm/tests/integration_tests.py
@@ -0,0 +1,71 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+from tests.integration_tests import OverrideDefinitions
+from tests.integration_tests.run_tests import run_tests
+
+
+def build_vlm_test_list() -> list[OverrideDefinitions]:
+    """
+    Return the list of OverrideDefinitions for the VLM integration tests.
+    Each entry holds the overrides used to generate a variation of the
+    integration tests based on the same root config file.
+    """
+    integration_tests_flavors = [
+        OverrideDefinitions(
+            [
+                [
+                    "--experimental.custom_args_module torchtitan.experiments.vlm.assets.job_config",
+                    "--model.name vlm",
+                    "--training.dataset cc12m-test",
+                    "--parallelism.data_parallel_shard_degree 4",
+                    "--data.max_patches_per_image 1024",
+                    "--data.max_images_per_batch 64",
+                ],
+            ],
+            "VLM FSDP",
+            "vlm_fsdp",
+            ngpu=4,
+        ),
+    ]
+    return integration_tests_flavors
+
+
+_TEST_SUITES_FUNCTION = {
+    "vlm": build_vlm_test_list,
+}
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("output_dir")
+    parser.add_argument(
+        "--config_path",
+        default="./tests/integration_tests/base_config.toml",
+        help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
+    )
+    parser.add_argument(
+        "--test_name",
+        default="all",
+        help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
+    )
+    parser.add_argument("--ngpu", default=8, type=int)
+    args = parser.parse_args()
+
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+    if os.listdir(args.output_dir):
+        raise RuntimeError("Please provide an empty output directory.")
+
+    test_list = _TEST_SUITES_FUNCTION["vlm"]()
+    run_tests(args, test_list)
+
+
+if __name__ == "__main__":
+    main()

Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,7 @@ install_pip_dependencies() {`
`42`	`42`	`pip_install -r /opt/conda/requirements-dev.txt`
`43`	`43`	`pip_install -r /opt/conda/requirements.txt`
`44`	`44`	`pip_install -r /opt/conda/requirements-flux.txt`
	`45`	`+ pip_install -r /opt/conda/requirements-vlm.txt`
`45`	`46`	`popd`
`46`	`47`	`}`
`47`	`48`
Original file line number	Diff line number	Diff line change
`@@ -106,7 +106,7 @@ def main():`
`106`	`106`	`"--test_suite",`
`107`	`107`	`default="features",`
`108`	`108`	`choices=["features", "models", "h100"],`
`109`		`- help="Which test suite to run. If not specified, torchtitan composibility tests will be run",`
	`109`	`+ help="Which test suite to run. If not specified, torchtitan composability tests will be run",`
`110`	`110`	`)`
`111`	`111`	`parser.add_argument(`
`112`	`112`	`"--config_path",`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../.ci/docker/requirements-vlm.txt`