[CI][1.x] Cherrypick: Upgrade unix gpu toolchain (#18186) (#18785)

* Update unix gpu toolchain (#18186) * update nvidiadocker command & remove cuda compat * replace cu101 with cuda since compat is no longer to be used * skip flaky tests * get rid of ubuntu_build_cuda and point ubuntu_cu101 to base gpu instead of cuda compat * Revert "skip flaky tests" This reverts commit 1c720fa. * revert removal of ubuntu_build_cuda * add linux gpu g4 node to all steps using g3 in unix-gpu pipeline * remove docker compose files * add back the caffe test since caffe is deprecated for mx2.0 and not 1.x * drop nvidia-docker requirement since docker19.0 supports it by default :q * remove compat from dockerfile * Cherry-pick #18635 to v1.7.x (#18935) * Remove mention of nightly in pypi (#18635) * update bert dev.tsv link Co-authored-by: Sheng Zha <szha@users.noreply.github.com> * disable tvm in CI functions that rely on libcuda compat * tvm off for ubuntu_gpu_cmake build * drop tvm from all unix-gpu builds Co-authored-by: Carin Meier <cmeier@gigasquidsoftware.com> Co-authored-by: Sheng Zha <szha@users.noreply.github.com>
apache · Aug 18, 2020 · 9981e84 · 9981e84
1 parent 6ae469a
commit 9981e84
Show file tree

Hide file tree

Showing 6 changed files with 35 additions and 129 deletions.
diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy
@@ -255,6 +255,7 @@ def assign_node_labels(args) {
   //    knowing about the limitations.
   NODE_LINUX_CPU = args.linux_cpu
   NODE_LINUX_GPU = args.linux_gpu
+  NODE_LINUX_GPU_G4 = args.linux_gpu_g4
   NODE_LINUX_GPU_P3 = args.linux_gpu_p3
   NODE_WINDOWS_CPU = args.windows_cpu
   NODE_WINDOWS_GPU = args.windows_gpu

diff --git a/ci/build.py b/ci/build.py
@@ -66,23 +66,18 @@ def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
     return os.path.join(path, "Dockerfile.{0}".format(platform))
 
 
-def get_docker_binary(use_nvidia_docker: bool) -> str:
-    return "nvidia-docker" if use_nvidia_docker else "docker"
-
-
-def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool,
+def build_docker(platform: str, registry: str, num_retries: int, no_cache: bool,
                  cache_intermediate: bool) -> str:
     """
     Build a container for the given platform
     :param platform: Platform
-    :param docker_binary: docker binary to use (docker/nvidia-docker)
     :param registry: Dockerhub registry name
     :param num_retries: Number of retries to build the docker image
     :param no_cache: pass no-cache to docker to rebuild the images
     :return: Id of the top level image
     """
     tag = get_docker_tag(platform=platform, registry=registry)
-    logging.info("Building docker container tagged '%s' with %s", tag, docker_binary)
+    logging.info("Building docker container tagged '%s'", tag)
     #
     # We add a user with the same group as the executing non-root user so files created in the
     # container match permissions of the local user. Same for the group.
@@ -99,7 +94,7 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries:
     #
     # This doesn't work with multi head docker files.
     #
-    cmd = [docker_binary, "build",
+    cmd = ["docker", "build",
            "-f", get_dockerfile(platform),
            "--build-arg", "USER_ID={}".format(os.getuid()),
            "--build-arg", "GROUP_ID={}".format(os.getgid())]
@@ -119,19 +114,19 @@ def run_cmd():
     run_cmd()
     # Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
     # check_call would have failed
-    image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
+    image_id = _get_local_image_id(docker_tag=tag)
     if not image_id:
         raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag))
     return image_id
 
 
-def _get_local_image_id(docker_binary, docker_tag):
+def _get_local_image_id(docker_tag):
     """
     Get the image id of the local docker layer with the passed tag
     :param docker_tag: docker tag
     :return: Image id as string or None if tag does not exist
     """
-    cmd = [docker_binary, "images", "-q", docker_tag]
+    cmd = ["docker", "images", "-q", docker_tag]
     image_id_b = check_output(cmd)
     image_id = image_id_b.decode('utf-8').strip()
     if not image_id:
@@ -196,8 +191,9 @@ def container_run(docker_client: SafeDockerClient,
 
     # Equivalent command
     docker_cmd_list = [
-        get_docker_binary(nvidia_runtime),
+        "docker",
         'run',
+        "--gpus all" if nvidia_runtime else "",
         "--cap-add",
         "SYS_PTRACE", # Required by ASAN
         '--rm',
@@ -352,7 +348,6 @@ def main() -> int:
     args = parser.parse_args()
 
     command = list(chain(*args.command))
-    docker_binary = get_docker_binary(args.nvidiadocker)
     docker_client = SafeDockerClient()
 
     environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
@@ -366,7 +361,7 @@ def main() -> int:
         if args.docker_registry:
             load_docker_cache(tag=tag, docker_registry=args.docker_registry)
         if not args.run_only:
-            build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry,
+            build_docker(platform=platform, registry=args.docker_registry,
                          num_retries=args.docker_build_retries, no_cache=args.no_cache,
                          cache_intermediate=args.cache_intermediate)
         else:
@@ -410,7 +405,7 @@ def main() -> int:
         for platform in platforms:
             tag = get_docker_tag(platform=platform, registry=args.docker_registry)
             load_docker_cache(tag=tag, docker_registry=args.docker_registry)
-            build_docker(platform, docker_binary=docker_binary, registry=args.docker_registry,
+            build_docker(platform, registry=args.docker_registry,
                          num_retries=args.docker_build_retries, no_cache=args.no_cache)
             if args.build_only:
                 continue

diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu101 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu101
@@ -79,4 +79,3 @@ RUN /work/ubuntu_adduser.sh
 COPY runtime_functions.sh /work/
 
 WORKDIR /work/mxnet
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/compat
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
@@ -767,7 +767,7 @@ build_ubuntu_gpu_mkldnn() {
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=1                               \
-        USE_TVM_OP=1                              \
+        USE_TVM_OP=0                              \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         USE_SIGNAL_HANDLER=1                      \
         -j$(nproc)
@@ -784,7 +784,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=0                               \
-        USE_TVM_OP=1                              \
+        USE_TVM_OP=0                              \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         USE_SIGNAL_HANDLER=1                      \
         -j$(nproc)
@@ -799,7 +799,7 @@ build_ubuntu_gpu_cuda101_cudnn7() {
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=1                               \
-        USE_TVM_OP=1                              \
+        USE_TVM_OP=0                              \
         USE_CPP_PACKAGE=1                         \
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
@@ -827,26 +827,6 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() {
     make cython PYTHON=python3
 }
 
-build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() {
-    set -ex
-    build_ccache_wrappers
-    make \
-        DEV=1                                     \
-        USE_BLAS=openblas                         \
-        USE_MKLDNN=0                              \
-        USE_CUDA=1                                \
-        USE_CUDA_PATH=/usr/local/cuda             \
-        USE_CUDNN=1                               \
-        USE_TVM_OP=0                              \
-        USE_CPP_PACKAGE=1                         \
-        USE_DIST_KVSTORE=1                        \
-        CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
-        USE_SIGNAL_HANDLER=1                      \
-        -j$(nproc)
-
-    make cython PYTHON=python3
-}
-
 build_ubuntu_amalgamation() {
     set -ex
     # Amalgamation can not be run with -j nproc
@@ -874,7 +854,7 @@ build_ubuntu_gpu_cmake_mkldnn() {
         -DUSE_SIGNAL_HANDLER=ON                 \
         -DUSE_CUDA=1                            \
         -DUSE_CUDNN=1                           \
-        -DUSE_TVM_OP=1                          \
+        -DUSE_TVM_OP=0                          \
         -DPython3_EXECUTABLE=/usr/bin/python3   \
         -DUSE_MKLML_MKL=1                       \
         -DCMAKE_BUILD_TYPE=Release              \
@@ -893,7 +873,7 @@ build_ubuntu_gpu_cmake() {
         -DUSE_SIGNAL_HANDLER=ON                 \
         -DUSE_CUDA=ON                           \
         -DUSE_CUDNN=ON                          \
-        -DUSE_TVM_OP=ON                         \
+        -DUSE_TVM_OP=OFF                         \
         -DPython3_EXECUTABLE=/usr/bin/python3   \
         -DUSE_MKL_IF_AVAILABLE=OFF              \
         -DUSE_MKLML_MKL=OFF                     \
@@ -916,7 +896,7 @@ build_ubuntu_gpu_cmake_no_rtc() {
         -DUSE_SIGNAL_HANDLER=ON                 \
         -DUSE_CUDA=ON                           \
         -DUSE_CUDNN=ON                          \
-        -DUSE_TVM_OP=ON                         \
+        -DUSE_TVM_OP=OFF                         \
         -DPython3_EXECUTABLE=/usr/bin/python3   \
         -DUSE_MKL_IF_AVAILABLE=OFF              \
         -DUSE_MKLML_MKL=OFF                     \
@@ -932,29 +912,6 @@ build_ubuntu_gpu_cmake_no_rtc() {
     ninja
 }
 
-build_ubuntu_gpu_cmake_no_tvm_op() {
-    set -ex
-    cd /work/build
-    build_ccache_wrappers
-    cmake \
-        -DUSE_SIGNAL_HANDLER=ON                 \
-        -DUSE_CUDA=ON                           \
-        -DUSE_CUDNN=ON                          \
-        -DUSE_TVM_OP=OFF                        \
-        -DPython3_EXECUTABLE=/usr/bin/python3   \
-        -DUSE_MKL_IF_AVAILABLE=OFF              \
-        -DUSE_MKLML_MKL=OFF                     \
-        -DUSE_MKLDNN=OFF                        \
-        -DUSE_DIST_KVSTORE=ON                   \
-        -DCMAKE_BUILD_TYPE=Release              \
-        -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-        -DBUILD_CYTHON_MODULES=1                \
-        -G Ninja                                \
-        /work/mxnet
-
-    ninja
-}
-
 build_ubuntu_cpu_large_tensor() {
     set -ex
     cd /work/build
@@ -980,7 +937,7 @@ build_ubuntu_gpu_large_tensor() {
         -DUSE_SIGNAL_HANDLER=ON                 \
         -DUSE_CUDA=ON                           \
         -DUSE_CUDNN=ON                          \
-        -DUSE_TVM_OP=ON                         \
+        -DUSE_TVM_OP=OFF                         \
         -DPython3_EXECUTABLE=/usr/bin/python3   \
         -DUSE_MKL_IF_AVAILABLE=OFF              \
         -DUSE_MKLML_MKL=OFF                     \