Merge with mlc-ai/main (68cd794d02bbff9842f08b6b2ff37eb582f411c0, 2024-08-01) #277

Merged on Aug 2, 2024 (532 commits).

Commits
3621bf6
[Eagle] Run additional decode for draft model when all proposals are …
vinx13 May 7, 2024
df4e2f3
[iOS] Introducing package CLI for iOS app packaging (#2297)
MasterJH5574 May 8, 2024
8a31986
Increase the timeout in PopenServer (#2298)
yongwww May 8, 2024
65f9716
[LLM-CHAT] Enable gpu softmax for penality softmax (#2288)
krishnaraj36 May 8, 2024
1bd1ab0
[iOS][REFACTOR] Restructure the iOS folders (#2299)
tqchen May 8, 2024
c580140
[KVCACHE][TIR] Improved tir schedule for decode tir page attention (#…
krishnaraj36 May 8, 2024
10f3e4d
[Sampler] Remove unneeded output_prob_dist param (#2300)
vinx13 May 9, 2024
33c15e7
Enable cuda graph for batch_verify (#2304)
vinx13 May 9, 2024
dbd13f4
[Android] Introducing mlc4j and app packaging (#2305)
MasterJH5574 May 10, 2024
b62dd91
[DOCS] Minor cleanup (#2308)
tqchen May 10, 2024
37230db
[DOCS] Update android doc (#2309)
tqchen May 10, 2024
8bb1d6e
[DOCS] Update android doc (#2310)
tqchen May 10, 2024
459ffe3
[SLM] Support BERT architecture. Implement a text embedding module (#…
rickzx May 10, 2024
ea391de
[Serving] Log batch size in NVTX (#2312)
vinx13 May 10, 2024
b01cfab
[Model] Removing unnecessary reshapes in get_logits (#2314)
vinx13 May 10, 2024
347222c
Skip cublas dispatch for single batch (#2315)
vinx13 May 10, 2024
73b733d
Auto updated submodule references
May 10, 2024
3a0b42c
[DOCS] Remove mention of legacy modules (#2318)
tqchen May 10, 2024
2b8aadf
[Android] Add `-j` option to cmake build (#2321)
MasterJH5574 May 10, 2024
98f0424
[DOCS] More clear android instruction (#2327)
tqchen May 11, 2024
21feb70
[Serving] Refactor to consolidate new request prefill (#2329)
vinx13 May 12, 2024
45a0487
[iOS] Make MLCEngine input to take in structured data (#2330)
tqchen May 12, 2024
679d3a8
[REFACTOR] Refactor JSONFFI Conv template (#2331)
tqchen May 13, 2024
821ee5d
[Eagle] Fix the requests for additional decode in eagle verify (#2336)
vinx13 May 13, 2024
bc6e3ed
[Serving][Grammar] Refactor GrammarStateMatcher and support LLaMA-3 (…
Ubospica May 14, 2024
0c03537
[DebugChat] Fix DebugChat softmax function and save logits to debug f…
rickzx May 14, 2024
b247f8d
[Serving] Add Medusa speculative decoding (#2337)
vinx13 May 14, 2024
2bbbd52
Fix cublas offloading (#2343)
vinx13 May 15, 2024
227dbb8
Add false for arg worker0_only in disco.empty (#2344)
yongwww May 15, 2024
9b89e04
Auto updated submodule references
May 15, 2024
56ea156
[JSONFFIEngine] Refactor device argument and request_stream_callback …
anibohara2000 May 15, 2024
152ecc4
[Serving] Add reset_engine in debug_entrypoints (#2347)
yongwww May 16, 2024
ac1cd51
[Bugfix] Make sequence_length dtype int64 in EngineConfig. Fix Mistra…
rickzx May 18, 2024
96fc289
[JSON FFI] Example Android Application using JSON FFI Engine (#2322)
Kartik14 May 18, 2024
0e3d536
[iOS] Update MLCEngine API to latest JSON FFI convention (#2359)
tqchen May 18, 2024
9998076
[JSONFFI] Fix JSONFFI conv template. Add unit tests (#2360)
rickzx May 19, 2024
beb126c
[Fix][Serving] Fix prefill chunk in interactive mode (#2363)
MasterJH5574 May 20, 2024
2146f15
[Fix][Serving] Respect sliding window size in config inference (#2364)
MasterJH5574 May 20, 2024
27dc5c8
[iOS] Add padding to app icon (#2365)
Neet-Nestor May 21, 2024
8aed35e
[Serving] Fix the self-ref in engine (#2367)
tqchen May 21, 2024
5444fd5
[Serving] Prefix Cache (#2295)
cyx-6 May 21, 2024
3c0b15c
[Fix] Use static_cast for `.size()` for safety (#2369)
MasterJH5574 May 21, 2024
ff39925
[Serving] Sliding-window-aware request prefill (#2370)
MasterJH5574 May 22, 2024
db039cf
[iOS] Update MLCSwift to fully follow OAI style. (#2371)
tqchen May 22, 2024
edc434d
Add nvtx in logic update (#2372)
yongwww May 22, 2024
8d3194c
[Test] Use HF model for JIT as much as possible (#2373)
MasterJH5574 May 22, 2024
20c198f
[Fix] Fix prefix cache reset and forking logic (#2374)
cyx-6 May 22, 2024
a5e71b3
[CLI] Migrate CLI to use the new Engine (#2375)
tqchen May 22, 2024
0724983
[TESTING] Introduce testing util to manage models (#2377)
tqchen May 22, 2024
6dd6c89
[REFACTOR][Rename] MLC_LLM_SOURCE_DIR and TVM_SOURCE_DIR source dire…
tqchen May 22, 2024
6de0f55
[REFACTOR][ENV] MLC_CACHE_DIR to MLC_LLM_HOME (#2379)
tqchen May 22, 2024
547060a
[iOS] Switch MLC Chat to use MLCEngine (#2380)
tqchen May 22, 2024
db833aa
[REFACTOR] Cleanup legacy code (#2381)
tqchen May 22, 2024
600a3e5
[Fix] Update prefix cache config (#2382)
cyx-6 May 22, 2024
2e1ff62
[PREFIX-CACHE] Fix some issues with prefix cache (#2384)
tqchen May 23, 2024
7eaeed1
[FIX] Typo on OpenAI Chat class in engine (#2385)
Faolain May 23, 2024
ac4dff7
[Serving][Refactor] Metrics and stats for CLI (#2387)
MasterJH5574 May 23, 2024
fbe3b9e
[REFACTOR] Organize metrics (#2390)
tqchen May 23, 2024
9631cc3
[Fix] Avoid ref capture in prefix cache contruction (#2391)
MasterJH5574 May 23, 2024
370fca5
[REFACTOR] Cleanup Metrics (#2392)
tqchen May 23, 2024
00c2292
[FIX] Fix mlc llm source dir argument (#2394)
tqchen May 23, 2024
ddbec62
[Fix] Fix the serialization of SpecDecodeMetrics (#2395)
MasterJH5574 May 23, 2024
eb546ee
[Fix] Update missing change in engine ffi func name (#2396)
cyx-6 May 23, 2024
040b10e
Auto updated submodule references
May 24, 2024
641b64b
[Fix] Fix no prefix cache (#2397)
cyx-6 May 24, 2024
988e9f0
add hasattr safecheck for MLCEngineBase (#2400)
BodhiHu May 24, 2024
70f2a76
[Refactor] Expose EngineConfig in engine constructor (#2399)
MasterJH5574 May 24, 2024
37da8e4
[REFACTOR] Introduce RequestMetrics and metrics endpoint (#2401)
tqchen May 24, 2024
a6d3cc1
[Fix] Fix format issue of MLCEngineBase (#2402)
MasterJH5574 May 24, 2024
9f96333
[FIX] fix comments in radix_tree.py (#2403)
ita9naiwa May 24, 2024
db78862
[Fix] Fix metric names in tests and static PrefixCacheModes (#2404)
MasterJH5574 May 24, 2024
d12afce
[Op] Tree attention (#2376)
spectrometerHBH May 24, 2024
d39272a
[REFACTOR] Reorganize GenerationConfig DebugConfig and FFI (#2407)
tqchen May 24, 2024
d770270
[Fix] Fix vector OOB when no inputs can be prefilled in spec decode (…
MasterJH5574 May 24, 2024
97df697
[Fix] Update number of available pages after prefix cache free (#2409)
MasterJH5574 May 24, 2024
7eba612
[REFACTOR] Enable validation logic in GenerationConfig (#2411)
tqchen May 24, 2024
905620c
[Chat] Support chat completion config override (#2412)
MasterJH5574 May 24, 2024
cd79b96
Change name RedixPage -> RadixPage in RadixTree.cc (#2413)
ita9naiwa May 24, 2024
cfc0597
[Fix] Fix ignore_eos support (#2414)
MasterJH5574 May 24, 2024
135419e
[Test][Refactor] Update tests to use require_test_model (#2415)
MasterJH5574 May 25, 2024
b18284b
[Serving] Enable GPU Sampling (#2368)
Hzfengsy May 25, 2024
0b2cbb2
[REFACTOR] Support latest include_usage and DebugOptions (#2417)
tqchen May 26, 2024
3b272eb
[DOWNLOAD] MLC_DOWNLOAD_POLICY and MLC_LLM_READONLY_WEIGHT_CACHES (#2…
tqchen May 26, 2024
c62e143
[REFACTOR] Rename MLC_LLM_READONLY_WEIGHT_CACHES (#2423)
tqchen May 26, 2024
13c0661
[Tokenizer] Auto-detect TokenizerInfo from tokenizer.json (#2416)
Ubospica May 26, 2024
8b38a4b
[REFACTOR] Remove dependencies on legacy chat_module (#2424)
tqchen May 26, 2024
ff91749
[REFACTOR] Terminology download=>download_cache (#2425)
tqchen May 26, 2024
14bec5a
[REFACTOR] Move GenerationConfig to protocol (#2427)
tqchen May 26, 2024
ae88612
Update README.md
Neet-Nestor May 27, 2024
0df00bf
[site] Add hero section to website (#2430)
Neet-Nestor May 27, 2024
1025926
[Compile] Skip CUDA graph rewrite when target is not CUDA (#2433)
MasterJH5574 May 27, 2024
00e79d1
[DOCS] Simplify read me (#2435)
tqchen May 27, 2024
21ac3a2
[DOCS] Update title to focus on engine feature
tqchen May 27, 2024
4538cc7
[Metadata] Remove stale KV cache size (#2434)
MasterJH5574 May 27, 2024
526114e
[iOS] Update the MLCSwift APIs to async (#2436)
tqchen May 27, 2024
c87d369
[Android] Switch MLC Chat to use MLCEngine (#2410)
mengshyu May 27, 2024
5b73ec3
[iOS] Remove Legacy ChatModule (#2437)
tqchen May 27, 2024
16fb729
[Delivery] Update model delivery script to support specifying the out…
rickzx May 27, 2024
ba8e20a
[Android] Remove Legacy ChatModule (#2438)
mengshyu May 27, 2024
be15b22
[Refactor] Remove ChatModule (#2440)
MasterJH5574 May 27, 2024
50adede
[Fix][REST] Fix usage-related server tests (#2441)
MasterJH5574 May 27, 2024
dc40656
[Site] Enlarge hero image in small screens
Neet-Nestor May 27, 2024
f2db8e4
Fix lint
tqchen May 27, 2024
d93e5a6
[ANDROID] Patches to enable windows usescase (#2443)
tqchen May 28, 2024
709644f
[DOCS] Guides for android on windows (#2444)
tqchen May 28, 2024
4df3abf
[DOCS] mention git-lfs (#2445)
tqchen May 28, 2024
2fc9c63
Fix Llama-3 conversation template. Add unit test (#2442)
rickzx May 27, 2024
cd4a853
[Grammar][Wasm] Update new grammar to wasm runtime (#2446)
CharlieFRuan May 28, 2024
de61926
[Model] Use float32 for RoPE calculation (#2449)
MasterJH5574 May 28, 2024
cf4bffe
[LogitProcessor] Use min float value as the mask value (#2451)
MasterJH5574 May 28, 2024
570380c
[Protocol] Use `by_alias=True` when dumping pydantic classes (#2450)
MasterJH5574 May 28, 2024
30e46b4
[Protocol] Use `by_alias=True` when dumping pydantic classes (#2452)
MasterJH5574 May 28, 2024
e9a63ed
[DOCS] Updates the URL of the Android APK (#2453)
mengshyu May 28, 2024
d1f5f51
Auto updated submodule references
May 28, 2024
6c31701
[Fix][Phi3] Add `</s>` as stop token for phi3 (#2455)
CharlieFRuan May 28, 2024
d7c159e
[Site] Add GitHub link to hero section
Neet-Nestor May 29, 2024
477da69
Update README.md
Neet-Nestor May 29, 2024
dc091e7
[Hermes2] Add conv template for Hermes2-Pro-Llama3 (#2457)
CharlieFRuan May 29, 2024
27d1f6f
[Compile] Add max_batch_size to metadata (#2463)
MasterJH5574 May 29, 2024
f2c1582
[REFACTOR] Re-organize the modules after transition to MLCEngine (#2464)
tqchen May 29, 2024
e90f2e7
[Serving] Add ICHECK for running batch size (#2465)
MasterJH5574 May 29, 2024
5df26b6
Auto updated submodule references
May 29, 2024
a8e85d0
[TEST] Start to categorize tests (#2466)
tqchen May 29, 2024
249b945
Implemented FP8 calibration (#2454)
vinx13 May 29, 2024
9efb1ba
[CI] Update CUDA build script with FlashInfer options (#2469)
MasterJH5574 May 30, 2024
e0e779a
[Serving] Use preferred host memory for host NDArrays (#2468)
MasterJH5574 May 30, 2024
515823c
[TEST] Temp disable UT stage
tqchen May 30, 2024
c4d337d
[CUDA] Turn on cuda graph at O2 (#2467)
vinx13 May 30, 2024
96d752c
[CI] Enable GPU env in CI (#2476)
tqchen May 30, 2024
cf0278f
[CMake] Update config.cmake generation script (#2478)
MasterJH5574 May 30, 2024
16f0af4
[TEST] MockEchoEngine (#2479)
tqchen May 31, 2024
33dbfd1
Auto updated submodule references
May 31, 2024
ab52b72
[Fix] Fix JSONFFI MemoryBufferStream after dmlc bump (#2480)
MasterJH5574 May 31, 2024
61889fe
[JSON-FFI] Enable n generation and pass in json schema (#2481)
tqchen May 31, 2024
8fc5efa
Refactor model delivery script to use pydantic (#2482)
rickzx May 31, 2024
589c76f
Fix tokenizers encode batch (#2484)
vinx13 Jun 1, 2024
c1628dd
[Bugfix] Fix delivered log issue in delivery cli (#2489)
rickzx Jun 2, 2024
abd7d51
Support Qwen2-MoE Architecture (#2089)
Hzfengsy Jun 2, 2024
46ee63a
[3rdparty] Bump tokenizers-cpp to include HF tokenizers bump (#2490)
MasterJH5574 Jun 2, 2024
71828b0
[Bench] Add mlc bench (#2474)
yongwww Jun 3, 2024
5b4fc07
Auto updated submodule references
Jun 3, 2024
91cc194
Enable n-sampling for Medusa spec decoding (#2495)
vinx13 Jun 3, 2024
94de2a4
[CONFIG] Remove mean_gen_len from the config (#2493)
tqchen Jun 3, 2024
c8bfb50
Update ios android docs (#2497)
tqchen Jun 3, 2024
5a8a728
[Bench] Add seed to __init__ and some minor change (#2496)
yongwww Jun 4, 2024
90170e6
[Fix][Config] Max total sequence length overflow with sliding window …
MasterJH5574 Jun 4, 2024
c0c33a5
[Serving] PagedKVCache tree-attention integration (#2487)
MasterJH5574 Jun 4, 2024
d6f7a58
[Sampler] Enhance checks for whether FlashInfer is enabled (#2502)
MasterJH5574 Jun 4, 2024
70b3102
[Android] Updates the default mode list and the APK link in the docum…
mengshyu Jun 4, 2024
e63aab4
[Fix] Fix the global func name of TokenizerDecode (#2514)
MasterJH5574 Jun 5, 2024
8e56d95
[Fix] Use the correct model to validate stream_options (#2508)
zifeitong Jun 5, 2024
4179922
[Fix] Typo in docs/install/tvm.rst (#2507)
zifeitong Jun 5, 2024
64e33c5
[FP8] Use f32 scale to enable better fusion (#2505)
vinx13 Jun 5, 2024
3bdc8f6
[Metrics] Add ttft and itl to server metrics (#2510)
yongwww Jun 5, 2024
3184294
[Model] Fix config detection for Mistral (#2504)
MasterJH5574 Jun 5, 2024
78e59ab
[Fix] Provide a GetTokenId API for SampleResult (#2516)
Ubospica Jun 5, 2024
3f36236
[Reapply][BUGFIX] Fix rare deadlock in threaded engine (#2429) (#2518)
MasterJH5574 Jun 6, 2024
fbc75c0
[Fix] Fix metrics division by 0 (#2519)
MasterJH5574 Jun 6, 2024
80789f4
Corrected the folder path for Android Studio Project (#2520)
Ramees025 Jun 6, 2024
fd51f97
Update tvm.rst
tqchen Jun 6, 2024
9de380c
[iOS] Update model list (#2524)
spectrometerHBH Jun 6, 2024
1881992
[Android] Updates the order of mode list and the APK link in the docu…
mengshyu Jun 6, 2024
61f5623
[Sampler] Skip top-p renormalization if top-p is 1 in CPUSampler (#2528)
MasterJH5574 Jun 6, 2024
9d16fec
[Docs] Rename javascript.rst to webllm.rst (#2531)
CharlieFRuan Jun 6, 2024
69c600c
[Conv] Add tinyLlama v1.0 conv template (#2530)
CharlieFRuan Jun 6, 2024
868334d
[iOS] correct mistral q3 url and handle screen switch off (#2529)
tqchen Jun 6, 2024
206db55
[Grammar] Fix include protection and paths in docstring (#2515)
Ubospica Jun 7, 2024
50a1a7c
[Tokenizer][Fix] Fix SegFault when analyzing tokenizers without token…
Ubospica Jun 7, 2024
5f71aa9
[Serving] Use stop strs and token ids for completions (#2534)
MasterJH5574 Jun 7, 2024
a096c91
[Serving] Support tensor parallel shards override in command line (#2…
MasterJH5574 Jun 7, 2024
9be4b92
Add tie_word_embedding option for Qwen2 model (#2535)
rickzx Jun 7, 2024
b5b40ee
[Bench] Defaults to aiohttp client, add ServerMetrics (#2527)
yongwww Jun 7, 2024
e601409
[Android] Remove var capture in TVM_SOURCE_DIR (#2538)
MasterJH5574 Jun 7, 2024
d5fbde2
[Fix] Fix inconsistent system prompt handling (#2539)
MasterJH5574 Jun 7, 2024
208642d
[Attention] Fix attn kernel for general GQA group size (#2543)
MasterJH5574 Jun 7, 2024
fcb50a2
fix: typo error (#2544)
michaelhenry Jun 7, 2024
6bd049e
[Fix] Fix attn kernel build issue (#2545)
MasterJH5574 Jun 7, 2024
961d5f1
[iOS] Add Qwen2 support (#2547)
tqchen Jun 7, 2024
78b6e1f
[Android] Add Qwen2 support (#2548)
mengshyu Jun 7, 2024
26a9cf0
[Android] Escape backslashes and quotation marks (#2546)
MasterJH5574 Jun 7, 2024
6bbd49c
[EngineConfig] Add override options (#2550)
MasterJH5574 Jun 7, 2024
f489d8d
[Site] Update link to webllm
Neet-Nestor Jun 8, 2024
db896d1
[Site] Update heading
Neet-Nestor Jun 8, 2024
203cda6
[Preset] Add model preset for model delivery (#2553)
CharlieFRuan Jun 8, 2024
9633c9f
Update docs to remove mention of older models (#2557)
tqchen Jun 8, 2024
c25834d
[Docs] Fix typo in mlc_llm chat command (#2560)
Neet-Nestor Jun 9, 2024
931587b
Fix compilation for gcc 13.2 (#2561)
elvin-n Jun 10, 2024
4234262
[Tokenizer] Priorize HuggingFace/SentencePiece over ByteLevelBPE (#2559)
MasterJH5574 Jun 10, 2024
42f146d
[Serving][Grammar] Jump-forward decoding (#2551)
Ubospica Jun 11, 2024
a231ae1
[Delivery] Update model delivery script (#2565)
rickzx Jun 11, 2024
873827c
[Model] Enhance error reporting for invalid tensor-parallel settings …
MasterJH5574 Jun 12, 2024
dcece51
[Serving] Apply tree structure in draft token verification (#2563)
vinx13 Jun 12, 2024
07c92b0
[Bench] Json mode bench (#2552)
cyx-6 Jun 12, 2024
94a0295
[Model] Support Multi-GPU for Qwen-MoE model (#2573)
MasterJH5574 Jun 13, 2024
ceba951
[Metrics] Add missing fields in `Reset` (#2574)
MasterJH5574 Jun 13, 2024
75b970b
[Doc] Update WebLLM doc (#2578)
CharlieFRuan Jun 14, 2024
e9340c3
[Op] Top-4 implementation for MoE model (#2586)
MasterJH5574 Jun 17, 2024
437166a
[Model] Gemma 1.1 compatibility (#2594)
MasterJH5574 Jun 19, 2024
6a48a02
[Serving] Hybrid prefill (#2604)
cyx-6 Jun 25, 2024
cbf0b02
Update quick_start.rst to fix broken links (#2607)
GunjanDhanuka Jun 27, 2024
d911c60
[Fix] Set the missed prefill finish time (#2613)
MasterJH5574 Jul 1, 2024
fbb6a48
[Android] Reduce binary size (#2606)
MasterJH5574 Jul 1, 2024
0575b92
[Fix] Gemma hidden_activation compatibility (#2614)
MasterJH5574 Jul 1, 2024
c09b108
Update debug_compare (#2612)
Hzfengsy Jul 2, 2024
2d32094
[SLM] Add support for InternLM2 architecture (#2608)
tlopex Jul 2, 2024
0fb5609
[Fix] Prefix cache only enables sliding window on leaf sequence (#2615)
cyx-6 Jul 2, 2024
adc6ee6
[Android] Update include path for tvm runtime src (#2616)
MasterJH5574 Jul 2, 2024
5b63980
[Fix] Mark the decode requests in hybrid prefill (#2621)
MasterJH5574 Jul 4, 2024
ebf5617
[Fix] Fix the chunked prefill condition (#2628)
MasterJH5574 Jul 5, 2024
5165a58
[SLM] Internlm2 Multi-GPU support (#2626)
tlopex Jul 8, 2024
c6122d7
[Serving] Merge multiple token embedding lookup into one (#2629)
MasterJH5574 Jul 8, 2024
c7756f9
[Model] Support Internlm2.5 (#2630)
tlopex Jul 8, 2024
7d73cfa
Fix for RWKV new config and new format vocab (#2632)
Hzfengsy Jul 8, 2024
16a79ab
[Fix] Fix KV cache single-page copy kernel (#2644)
MasterJH5574 Jul 11, 2024
64d8dc6
[Fix][Tokenizer] Fix failure in decoding tokens for ByteLevel BPE (#2…
Ubospica Jul 11, 2024
cbf6ae0
[Fix][Bitmask] Mask dummy padded tokens for grammar (#2651)
CharlieFRuan Jul 12, 2024
2345900
[Engine] Reduce action post-process overhead (#2653)
MasterJH5574 Jul 13, 2024
17ad72c
[PrefixCache] Defer sequence extension (#2654)
MasterJH5574 Jul 14, 2024
5bedaec
[Model] Support Starcoder2 (#2657)
tlopex Jul 15, 2024
baeb195
[Engine] Lazy recompute in GetRunningRequestStateEntries (#2655)
MasterJH5574 Jul 15, 2024
8290a97
[Fix] Fix prefix cache reuse with eagle mode (#2664)
cyx-6 Jul 16, 2024
52c0638
[Model] Support SmolLM (#2667)
CharlieFRuan Jul 17, 2024
c06bb39
[SLM] Starcoder2 Multi-GPU support (#2662)
tlopex Jul 17, 2024
4c4f060
[Engine] Defer the collection of decode inputs in prefill (#2668)
MasterJH5574 Jul 18, 2024
b1834f8
support mistral-nemo (#2676)
yyjhao Jul 22, 2024
a49abcc
[Model] Fix annotation typos (#2672)
tlopex Jul 22, 2024
ecae55c
[Model] Support Llama3.1 (#2682)
MasterJH5574 Jul 23, 2024
cdbd3ed
[SLM] Introduce microsoft/Phi-3 vision (#2658)
mengshyu Jul 24, 2024
9e23e37
[Preset] Add llama3.1 to preset, comment out llama3 (#2683)
CharlieFRuan Jul 24, 2024
fd20c56
[Pass] Rewrite FuseAddRMSNorm to avoid binding rewrite recursion (#2689)
MasterJH5574 Jul 25, 2024
a6aabd6
Initialize all `local_top_k` values in `gating_softmax_topk` (#2694)
Lunderberg Jul 26, 2024
803becc
[Serving] Fix spec decoding call packed with rvalue (#2699)
vinx13 Jul 26, 2024
1364830
[ASYNC] Properly abort cleanup in async handling (#2698)
tqchen Jul 26, 2024
6156dc3
[Serve] Expose prefill mode option (#2701)
cyx-6 Jul 28, 2024
da06a06
[Fix] Fix hybrid prefill disabled (#2705)
cyx-6 Jul 29, 2024
3c7a6d5
Turn on custom allreduce by default in O3 (#2706)
vinx13 Jul 30, 2024
551f3fe
[Fix] Fix hybrid prefill index error (#2707)
cyx-6 Jul 30, 2024
95f8797
[Bench] Revamp benchmark submodule (#2702)
MasterJH5574 Jul 30, 2024
d54007b
[Serving] Fix handling of num_tokens_for_next_decode in spec decoding…
vinx13 Jul 30, 2024
31efb35
Update worker.py for compatibility with upstream TVM (#2712)
Lunderberg Jul 31, 2024
0561a9b
Add support for Gemma2 (#2674)
yyjhao Jul 31, 2024
39069f7
[Preset] Add gemma2 preset (#2715)
CharlieFRuan Aug 1, 2024
7296565
[Android] Update model for Andorid APK (#2718)
mengshyu Aug 1, 2024
59cf662
[iOS] Add Gemma2 for iOS app (#2717)
MasterJH5574 Aug 1, 2024
97bbf52
Default bundle gemma2 (#2721)
tqchen Aug 1, 2024
b0f2731
[Bench] LLMPerf dataset (#2713)
cyx-6 Aug 1, 2024
709f484
[ConvTemplate] Update Gemma template with <bos> (#2722)
MasterJH5574 Aug 1, 2024
68cd794
[C++] Handle system_prefix_token_ids in C++ Conv template (#2723)
MasterJH5574 Aug 1, 2024
a0702ca
Merge remote-tracking branch 'upstream/main' into spark/merge-upstrea…
sunggg Aug 1, 2024
e413b3c
Delete .gitmodules
sunggg Aug 1, 2024
[CI] Enable GPU env in CI (mlc-ai#2476)
* [CI] Enable GPU env in CI

This PR enables GPU env in ci docker/bash.sh

* remove dep on tvm testing plugin
tqchen authored May 30, 2024
commit 96d752ca13f75cddbf33c4723a10eace0b512b30
34 changes: 32 additions & 2 deletions ci/bash.sh
@@ -47,12 +47,42 @@ else
     COMMAND=("$@")
 fi

+if [[ -n ${MLC_CI_SETUP_DEPS:-} ]]; then
+    DOCKER_ENV="${DOCKER_ENV} -e MLC_CI_SETUP_DEPS=${MLC_CI_SETUP_DEPS}"
+fi
+
 # Use nvidia-docker if the container is GPU.
-if [[ ! -z $CUDA_VISIBLE_DEVICES ]]; then
-    DOCKER_ENV="${DOCKER_ENV} -e CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}"
+if [[ -n ${CUDA_VISIBLE_DEVICES:-} ]]; then
+    DOCKER_ENV="${DOCKER_ENV} -e CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}"
+    if type nvidia-docker 1> /dev/null 2> /dev/null; then
+        DOCKER_BINARY=nvidia-docker
+    else
+        DOCKER_BINARY=docker
+        DOCKER_ENV="${DOCKER_ENV} --gpus all"
+    fi
+
+    # nvidia-docker treats Vulkan as a graphics API, so we need to
+    # request passthrough of graphics APIs. This could also be set in
+    # the Dockerfile.
+    DOCKER_ENV="${DOCKER_ENV} -e NVIDIA_DRIVER_CAPABILITIES=compute,graphics,utility"
+
+    # Vulkan compatibility
+    ICD_SEARCH_LOCATIONS=(
+        # https://github.com/KhronosGroup/Vulkan-Loader/blob/master/loader/LoaderAndLayerInterface.md#icd-discovery-on-linux
+        /usr/local/etc/vulkan/icd.d
+        /usr/local/share/vulkan/icd.d
+        /etc/vulkan/icd.d
+        /usr/share/vulkan/icd.d
+        /etc/glvnd/egl_vendor.d
+        /usr/share/glvnd/egl_vendor.d
+    )
+    for filename in $(find "${ICD_SEARCH_LOCATIONS[@]}" -name "*nvidia*.json" 2> /dev/null); do
+        DOCKER_VOLUMNS="${DOCKER_VOLUMNS} -v ${filename}:${filename}:ro"
+    done
 fi

 # Print arguments.
 echo "DOCKER_BINARY ${DOCKER_BINARY}"
 echo "WORKSPACE: ${WORKSPACE}"
 echo "IMAGE NAME: ${DOCKER_IMAGE_NAME}"
 echo "ENV VARIABLES: ${DOCKER_ENV}"
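The runtime-selection pattern this hunk adds (prefer nvidia-docker when present, otherwise fall back to plain docker with `--gpus all`) can be sketched as a small standalone function. This is an illustrative sketch, not code from the commit: the function name `choose_docker` and the parameterized probe command are assumptions made so the fallback logic can be exercised without nvidia-docker installed.

```shell
#!/bin/sh
# Sketch of the container-runtime selection pattern from ci/bash.sh:
# prefer nvidia-docker when it exists, otherwise fall back to plain
# docker with "--gpus all". The command to probe is a parameter so
# both branches can be demonstrated on any machine.
choose_docker() {
    if command -v "$1" > /dev/null 2>&1; then
        printf '%s\n' "$1"
    else
        printf '%s\n' "docker --gpus all"
    fi
}

choose_docker sh                    # present on any POSIX system: chosen as-is
choose_docker no-such-binary-xyz    # missing: falls back to docker --gpus all
```

The real script additionally appends `-e NVIDIA_DRIVER_CAPABILITIES=...` and the Vulkan ICD volume mounts when it takes the GPU branch; the sketch above isolates only the binary-selection decision.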
29 changes: 23 additions & 6 deletions ci/jenkinsfile.groovy
@@ -17,13 +17,14 @@

 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils

-run_cpu = "bash ci/bash.sh mlcaidev/ci-cpu:4d61e5d -e GPU cpu"
-run_cuda = "bash ci/bash.sh mlcaidev/ci-cu121:4d61e5d -e GPU cuda-12.1"
-run_rocm = "bash ci/bash.sh mlcaidev/ci-rocm57:4d61e5d -e GPU rocm-5.7"
+run_cpu = "bash ci/bash.sh mlcaidev/ci-cpu:4d61e5d -e GPU cpu -e MLC_CI_SETUP_DEPS 1"
+run_cuda = "bash ci/bash.sh mlcaidev/ci-cu121:4d61e5d -e GPU cuda-12.1 -e MLC_CI_SETUP_DEPS 1"
+run_rocm = "bash ci/bash.sh mlcaidev/ci-rocm57:4d61e5d -e GPU rocm-5.7 -e MLC_CI_SETUP_DEPS 1"

-pkg_cpu = "bash ci/bash.sh mlcaidev/package-rocm57:561ceee -e GPU cpu"
-pkg_cuda = "bash ci/bash.sh mlcaidev/package-cu121:561ceee -e GPU cuda-12.1"
-pkg_rocm = "bash ci/bash.sh mlcaidev/package-rocm57:561ceee -e GPU rocm-5.7"
+pkg_cpu = "bash ci/bash.sh mlcaidev/package-rocm57:561ceee -e GPU cpu -e MLC_CI_SETUP_DEPS 1"
+pkg_cuda = "bash ci/bash.sh mlcaidev/package-cu121:561ceee -e GPU cuda-12.1 -e MLC_CI_SETUP_DEPS 1"
+pkg_rocm = "bash ci/bash.sh mlcaidev/package-rocm57:561ceee -e GPU rocm-5.7 -e MLC_CI_SETUP_DEPS 1"

 def per_exec_ws(folder) {
   return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder

@@ -176,6 +177,22 @@ stage('Build') {
   )
 }

+stage('Unittest') {
+  parallel(
+    'CUDA': {
+      node('GPU') {
+        ws(per_exec_ws('mlc-llm-unittest')) {
+          init_git(false)
+          sh(script: "ls -alh", label: 'Show work directory')
+          unpack_lib('mlc_wheel_cuda', 'wheels/*.whl')
+          sh(script: "${run_cuda} conda env export --name ci-unittest", label: 'Checkout version')
+          sh(script: "${run_cuda} conda run -n ci-unittest ./ci/task/test_unittest.sh", label: 'Testing')
+        }
+      }
+    }
+  )
+}
+
 stage('Model Compilation') {
   parallel(
     'CUDA': {
9 changes: 6 additions & 3 deletions ci/task/pylint.sh
@@ -6,9 +6,12 @@ set -x
 : ${GPU:="cpu"}
 export PYTHONPATH="./python":${PYTHONPATH:-""}

-# TVM Unity is a dependency to this testing
-pip install --quiet --pre -U -f https://mlc.ai/wheels mlc-ai-nightly
-pip install --quiet --pre -U cuda-python
+if [[ -n ${MLC_CI_SETUP_DEPS:-} ]]; then
+    echo "MLC_CI_SETUP_DEPS=1 start setup deps"
+    # TVM Unity is a dependency to this testing
+    pip install --quiet --pre -U -f https://mlc.ai/wheels mlc-ai-nightly
+    pip install --quiet --pre -U cuda-python
+fi

 pylint --jobs $NUM_THREADS ./python/
 pylint --jobs $NUM_THREADS --recursive=y ./tests/python/
10 changes: 10 additions & 0 deletions ci/task/test_unittest.sh
@@ -2,6 +2,16 @@
 set -eo pipefail
 set -x

+# this script only runs setup in CI environments where these variables are passed
+if [[ -n ${MLC_CI_SETUP_DEPS:-} ]]; then
+    echo "MLC_CI_SETUP_DEPS=1 start setup deps.."
+    # Install dependencies
+    pip install --force-reinstall wheels/*.whl
+    pip install --quiet pytest
+    pip install --pre -U -f https://mlc.ai/wheels mlc-ai-nightly-cu121
+    export LD_LIBRARY_PATH=/usr/local/cuda/compat/:$LD_LIBRARY_PATH
+fi
+
 # run all tests that are categorized as "unittest"
 # add pytestmark = [pytest.mark.unittest] in the test file
 # so they will be run here
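The marker convention the comments above describe can be shown with a tiny test module. This is an illustrative sketch, not a file from the repository: the test name is made up, and custom marks would normally also be registered in the project's pytest configuration to avoid warnings.

```python
# Sketch of the test-categorization convention from test_unittest.sh:
# a module-level pytestmark tags every test in this module as "unittest",
# so a marker-filtered run can select exactly these tests.
import pytest

pytestmark = [pytest.mark.unittest]  # applies to every test in this module


def test_example():
    assert 1 + 1 == 2
```

An invocation such as `pytest -m unittest` would then collect only tests carrying this marker, which is presumably how the CI script restricts itself to the "unittest" category.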
3 changes: 0 additions & 3 deletions tests/python/conftest.py
@@ -16,9 +16,6 @@
 # under the License.
 # pylint: disable=missing-module-docstring,unused-import
 import pytest
-import tvm.testing
-
-pytest_plugins = ["tvm.testing.plugin"]


 def pytest_configure(config):