[aarch64] Add CUDA 12.4 build script for ARM wheel #1775

Merged 5 commits on Apr 19, 2024
Changes from all commits
21 changes: 21 additions & 0 deletions .github/workflows/build-manywheel-images.yml
@@ -13,6 +13,7 @@ on:
       - .github/workflows/build-manywheel-images.yml
       - manywheel/Dockerfile
       - manywheel/Dockerfile_aarch64
+      - manywheel/Dockerfile_cuda_aarch64
       - manywheel/Dockerfile_cxx11-abi
       - manywheel/build_docker.sh
       - 'common/*'
@@ -21,6 +22,7 @@ on:
       - .github/workflows/build-manywheel-images.yml
       - manywheel/Dockerfile
       - manywheel/Dockerfile_aarch64
+      - manywheel/Dockerfile_cuda_aarch64
       - manywheel/Dockerfile_cxx11-abi
       - 'common/*'
       - manywheel/build_docker.sh
@@ -54,6 +56,25 @@ jobs:
       - name: Build Docker Image
         run: |
           manywheel/build_docker.sh
+  build-docker-cuda-aarch64:
+    runs-on: linux.arm64.2xlarge
+    strategy:
+      matrix:
+        cuda_version: ["12.4"]
+    env:
+      GPU_ARCH_TYPE: cuda-aarch64
+      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
+    steps:
+      - name: Checkout PyTorch
+        uses: actions/checkout@v3
+      - name: Authenticate if WITH_PUSH
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        run: |
+          manywheel/build_docker.sh
   build-docker-rocm:
     runs-on: linux.12xlarge
     strategy:
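For reference, the new job drives the same entry point as the existing manywheel jobs. A local dry run of the image build might look like the sketch below (an assumption-laden example: it presumes an arm64 Docker host and that manywheel/build_docker.sh reads GPU_ARCH_TYPE and GPU_ARCH_VERSION from the environment, as the job's env block suggests):

    # Build the cuda-aarch64 manylinux image locally, without pushing
    export GPU_ARCH_TYPE=cuda-aarch64
    export GPU_ARCH_VERSION=12.4   # matches the workflow's matrix entry
    manywheel/build_docker.sh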
8 changes: 7 additions & 1 deletion aarch64_linux/aarch64_ci_build.sh
@@ -26,4 +26,10 @@ cd /
 git config --global --add safe.directory /pytorch
 pip install -r /pytorch/requirements.txt
 pip install auditwheel
-python /builder/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+if [ -n "$GPU_ARCH_VERSION" ]; then
+    echo "BASE_CUDA_VERSION is set to: $GPU_ARCH_VERSION"
+    python /builder/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+else
+    echo "BASE_CUDA_VERSION is not set."
+    python /builder/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+fi
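The CUDA path is gated purely on GPU_ARCH_VERSION being non-empty. A minimal sketch of exercising both branches (assuming the /builder checkout layout the CI container uses):

    # CUDA build: any non-empty value flips the script to --enable-cuda
    GPU_ARCH_VERSION=12.4 bash aarch64_linux/aarch64_ci_build.sh

    # CPU-only build
    unset GPU_ARCH_VERSION
    bash aarch64_linux/aarch64_ci_build.sh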
180 changes: 139 additions & 41 deletions aarch64_linux/aarch64_wheel_ci_build.py
@@ -9,103 +9,201 @@


 def list_dir(path: str) -> List[str]:
-    ''''
+    """'
     Helper for getting paths for Python
-    '''
+    """
     return check_output(["ls", "-1", path]).decode().split("\n")


 def build_ArmComputeLibrary() -> None:
-    '''
+    """
     Using ArmComputeLibrary for aarch64 PyTorch
-    '''
-    print('Building Arm Compute Library')
-    acl_build_flags=["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0",
-                     "arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"]
-    acl_install_dir="/acl"
-    acl_checkout_dir="ComputeLibrary"
+    """
+    print("Building Arm Compute Library")
+    acl_build_flags = [
+        "debug=0",
+        "neon=1",
+        "opencl=0",
+        "os=linux",
+        "openmp=1",
+        "cppthreads=0",
+        "arch=armv8a",
+        "multi_isa=1",
+        "fixed_format_kernels=1",
+        "build=native",
+    ]
+    acl_install_dir = "/acl"
+    acl_checkout_dir = "ComputeLibrary"
     os.makedirs(acl_install_dir)
-    check_call(["git", "clone", "https://github.com/ARM-software/ComputeLibrary.git", "-b", "v23.08",
-                "--depth", "1", "--shallow-submodules"])
-    check_call(["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"] + acl_build_flags,
-               cwd=acl_checkout_dir)
+    check_call(
+        [
+            "git",
+            "clone",
+            "https://github.com/ARM-software/ComputeLibrary.git",
+            "-b",
+            "v23.08",
+            "--depth",
+            "1",
+            "--shallow-submodules",
+        ]
+    )
+    check_call(
+        ["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"]
+        + acl_build_flags,
+        cwd=acl_checkout_dir,
+    )
     for d in ["arm_compute", "include", "utils", "support", "src"]:
         shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")


+def update_wheel(wheel_path) -> None:
+    """
+    Update the cuda wheel libraries
+    """
+    folder = os.path.dirname(wheel_path)
+    wheelname = os.path.basename(wheel_path)
+    os.mkdir(f"{folder}/tmp")
+    os.system(f"unzip {wheel_path} -d {folder}/tmp")
+    libs_to_copy = [
+        "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
+        "/usr/local/cuda/lib64/libcudnn.so.8",
+        "/usr/local/cuda/lib64/libcublas.so.12",
+        "/usr/local/cuda/lib64/libcublasLt.so.12",
+        "/usr/local/cuda/lib64/libcudart.so.12",
+        "/usr/local/cuda/lib64/libcufft.so.11",
+        "/usr/local/cuda/lib64/libcusparse.so.12",
+        "/usr/local/cuda/lib64/libcusparseLt.so.0",
+        "/usr/local/cuda/lib64/libcusolver.so.11",
+        "/usr/local/cuda/lib64/libcurand.so.10",
+        "/usr/local/cuda/lib64/libnvToolsExt.so.1",
+        "/usr/local/cuda/lib64/libnvJitLink.so.12",
+        "/usr/local/cuda/lib64/libnvrtc.so.12",
+        "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.4",
+        "/usr/local/cuda/lib64/libcudnn_adv_infer.so.8",
+        "/usr/local/cuda/lib64/libcudnn_adv_train.so.8",
+        "/usr/local/cuda/lib64/libcudnn_cnn_infer.so.8",
+        "/usr/local/cuda/lib64/libcudnn_cnn_train.so.8",
+        "/usr/local/cuda/lib64/libcudnn_ops_infer.so.8",
+        "/usr/local/cuda/lib64/libcudnn_ops_train.so.8",
+        "/opt/conda/envs/aarch64_env/lib/libopenblas.so.0",
"/opt/conda/envs/aarch64_env/lib/libgfortran.so.5",
"/opt/conda/envs/aarch64_env/lib/libgomp.so.1",
Review thread on the libgomp.so.1 entry:

Contributor: Currently the scripts are packaging libomp.so. Did you check the inference performance for any models? Is there any performance difference observed with libgomp vs libomp in the current wheels?

Contributor: I'm observing around a 10% performance drop for eager-mode inference with libgomp compared to libomp. If you don't have a strong preference, I suggest keeping libomp till we know better. For more details, check my comment here: #1774 (comment)

@Aidyn-A (Contributor, Apr 12, 2024): Can you please do a follow-up on the libgomp to libomp migration in your PR #1781? It is not a trivial change and certainly out of scope for this PR. I would like to underline that this PR is for enabling CUDA. libgomp has been used with PyTorch for a long time: it is reliable and nothing is wrong with it functionality-wise. Moreover, I do not want Ting to waste her time debugging dependencies due to libomp in this PR.

@snadampal (Contributor, Apr 12, 2024): Hi @Aidyn-A, currently the aarch64 wheels are linked to libomp, not libgomp: https://pypi.org/project/torch/#files. My point was: why change it now, without a strong reason? I have another PR to switch the wheels from libomp to libgomp, but it is currently blocked due to the 10% regression.

Contributor: From what I am seeing, it comes with:

    /usr/local/lib/python3.8/dist-packages/torch.libs/libgomp-0f9e2209.so.1.0.0

Contributor: I hope you are checking either the torch 2.2 or the nightly aarch64-linux wheel, because I am seeing:

    libomp-b8e5bcfb.so => /home/ubuntu/.local/lib/python3.10/site-packages/torch/lib/./../../torch.libs/libomp-b8e5bcfb.so (0x0000ffffa8c30000)

Complete list:

    linux-vdso.so.1 (0x0000ffffb298d000)
    libtorch_cpu.so => /home/ubuntu/.local/lib/python3.10/site-packages/torch/lib/./libtorch_cpu.so (0x0000ffffaa960000)
    libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000ffffaa930000)
    libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffaa780000)
    /lib/ld-linux-aarch64.so.1 (0x0000ffffb2954000)
    libc10.so => /home/ubuntu/.local/lib/python3.10/site-packages/torch/lib/./libc10.so (0x0000ffffaa680000)
    librt.so.1 => /lib/aarch64-linux-gnu/librt.so.1 (0x0000ffffaa660000)
    libdl.so.2 => /lib/aarch64-linux-gnu/libdl.so.2 (0x0000ffffaa640000)
    libopenblasp-r0-f658af2e.3.25.so => /home/ubuntu/.local/lib/python3.10/site-packages/torch/lib/./../../torch.libs/libopenblasp-r0-f658af2e.3.25.so (0x0000ffffa8e10000)
    libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffa8d70000)
    libomp-b8e5bcfb.so => /home/ubuntu/.local/lib/python3.10/site-packages/torch/lib/./../../torch.libs/libomp-b8e5bcfb.so (0x0000ffffa8c30000)
    libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000ffffa8c10000)
    libarm_compute-7362313d.so => /home/ubuntu/.local/lib/python3.10/site-packages/torch/lib/./../../torch.libs/libarm_compute-7362313d.so (0x0000ffffa8170000)
    libarm_compute_graph-15f701fb.so => /home/ubuntu/.local/lib/python3.10/site-packages/torch/lib/./../../torch.libs/libarm_compute_graph-15f701fb.so (0x0000ffffa8030000)
    libarm_compute_core-0793f69d.so => /home/ubuntu/.local/lib/python3.10/site-packages/torch/lib/./../../torch.libs/libarm_compute_core-0793f69d.so (0x0000ffffa7fe0000)
    libstdc++.so.6 => /lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000ffffa7db0000)
    libgfortran-105e6576.so.5.0.0 => /home/ubuntu/.local/lib/python3.10/site-packages/torch/lib/./../../torch.libs/libgfortran-105e6576.so.5.0.0 (0x0000ffffa7c50000)

Contributor: By the way, libomp wasn't intentionally chosen for the aarch64-linux wheels; I think it was picked up from the conda environment. If we all agree that libgomp is what is recommended for PyTorch, then I'm fine with switching to it now. In fact, I've already suggested moving to GNU OpenMP (#1774) but am waiting mainly because of the 10% regression with it in eager mode.

Contributor: Now we have more data on libgomp vs libomp (please check #1774 (comment)), and I'm fine with switching to libgomp for aarch64 linux.

"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
"/acl/build/libarm_compute_core.so",
]
# Copy libraries to unzipped_folder/a/lib
for lib_path in libs_to_copy:
lib_name = os.path.basename(lib_path)
shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}")
os.system(
f"cd {folder}/tmp/torch/lib/; patchelf --set-rpath '$ORIGIN' {folder}/tmp/torch/lib/libtorch_cuda.so"
)
os.mkdir(f"{folder}/cuda_wheel")
os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
shutil.move(
f"{folder}/cuda_wheel/{wheelname}",
f"/dist/{wheelname}",
copy_function=shutil.copy2,
)
os.system(f"rm -rf {folder}/tmp {folder}/dist/cuda_wheel/")


def complete_wheel(folder: str) -> str:
'''
"""
Complete wheel build and put in artifact location
'''
"""
wheel_name = list_dir(f"/{folder}/dist")[0]

if "pytorch" in folder:
if "pytorch" in folder and not enable_cuda:
print("Repairing Wheel with AuditWheel")
check_call(["auditwheel","repair", f"dist/{wheel_name}"], cwd=folder)
check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]

print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
os.rename(f"/{folder}/wheelhouse/{repaired_wheel_name}", f"/{folder}/dist/{repaired_wheel_name}")
os.rename(
f"/{folder}/wheelhouse/{repaired_wheel_name}",
f"/{folder}/dist/{repaired_wheel_name}",
)
else:
repaired_wheel_name = wheel_name

print(f"Copying {repaired_wheel_name} to artfacts")
shutil.copy2(f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}")
print(f"Copying {repaired_wheel_name} to artifacts")
shutil.copy2(
f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
)

return repaired_wheel_name


 def parse_arguments():
-    '''
+    """
     Parse inline arguments
-    '''
+    """
     from argparse import ArgumentParser

     parser = ArgumentParser("AARCH64 wheels python CD")
     parser.add_argument("--debug", action="store_true")
     parser.add_argument("--build-only", action="store_true")
     parser.add_argument("--test-only", type=str)
     parser.add_argument("--enable-mkldnn", action="store_true")
+    parser.add_argument("--enable-cuda", action="store_true")
     return parser.parse_args()


-if __name__ == '__main__':
-    '''
+if __name__ == "__main__":
+    """
     Entry Point
-    '''
+    """
     args = parse_arguments()
     enable_mkldnn = args.enable_mkldnn
-    repo = Repository('/pytorch')
+    enable_cuda = args.enable_cuda
+    repo = Repository("/pytorch")
     branch = repo.head.name
-    if branch == 'HEAD':
-        branch = 'master'
-
+    if branch == "HEAD":
+        branch = "master"

-    print('Building PyTorch wheel')
+    print("Building PyTorch wheel")
     build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
     os.system("python setup.py clean")

     override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
     if override_package_version is not None:
         version = override_package_version
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
-    elif branch in ['nightly', 'master']:
-        build_date = check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '')
-        version = check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2]
+        build_vars += (
+            f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
+        )
+    elif branch in ["nightly", "master"]:
+        build_date = (
+            check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
+            .decode()
+            .replace("-", "")
+        )
+        version = (
+            check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
+        )
         build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
     elif branch.startswith(("v1.", "v2.")):
         build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "

     if enable_mkldnn:
         build_ArmComputeLibrary()
         print("build pytorch with mkldnn+acl backend")
-        build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " \
-                      "ACL_ROOT_DIR=/acl " \
-                      "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH " \
-                      "ACL_INCLUDE_DIR=/acl/build " \
-                      "ACL_LIBRARY=/acl/build "
+        build_vars += (
+            "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
+            "ACL_ROOT_DIR=/acl "
+            "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
+            "ACL_INCLUDE_DIR=/acl/build "
+            "ACL_LIBRARY=/acl/build "
+        )
     else:
         print("build pytorch without mkldnn backend")

     os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
pytorch_wheel_name = complete_wheel("pytorch")
print(f"Build Compelete. Created {pytorch_wheel_name}..")
if enable_cuda:
print("Updating Cuda Dependency")
filename = os.listdir("/pytorch/dist/")
wheel_path = f"/pytorch/dist/{filename[0]}"
update_wheel(wheel_path)
pytorch_wheel_name = complete_wheel("/pytorch/")
print(f"Build Complete. Created {pytorch_wheel_name}..")
90 changes: 90 additions & 0 deletions common/install_cuda_aarch64.sh
@@ -0,0 +1,90 @@
#!/bin/bash

set -ex

function install_cusparselt_052 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && pushd tmp_cusparselt
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cusparselt
}

function install_124 {
echo "Installing CUDA 12.4 and cuDNN 8.9 and NCCL 2.20.5 and cuSparseLt-0.5.2"
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux_sbsa.run
chmod +x cuda_12.4.0_550.54.14_linux_sbsa.run
./cuda_12.4.0_550.54.14_linux_sbsa.run --toolkit --silent
rm -f cuda_12.4.0_550.54.14_linux_sbsa.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda

# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-8.9.2.26_cuda12-archive.tar.xz -O cudnn-linux-sbsa-8.9.2.26_cuda12-archive.tar.xz
tar xf cudnn-linux-sbsa-8.9.2.26_cuda12-archive.tar.xz
cp -a cudnn-linux-sbsa-8.9.2.26_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-sbsa-8.9.2.26_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn

# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b v2.20.5-1 --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl

install_cusparselt_052

ldconfig
}

function prune_124 {
Review thread on prune_124:

@nWEIdia (Collaborator): What does NVPrune do?

Collaborator: @nWEIdia it removes unused GPU architectures from libraries to lower the binary size. This workflow can be dangerous if libraries depend on heuristics and select kernels from the same GPU family (we've seen issues before where sm_61 was dropped, causing all kinds of issues on GTX cards). I don't think the pruning is useful anymore, as we are using the CUDA dependencies from PyPI now. However, we might want to keep it here and follow up with a cleanup in a separate PR.

echo "Pruning CUDA 12.4"
#####################################################################################
# CUDA 12.4 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"

export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"

if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi

# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"

# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a

#####################################################################################
# CUDA 12.4 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.4/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
}

# idiomatic parameter and option handling in sh
while test $# -gt 0
do
case "$1" in
12.4) install_124; prune_124
;;
*) echo "bad argument $1"; exit 1
;;
esac
shift
done
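The script is driven by a single positional version argument. A sketch of running it and then spot-checking the pruning (an assumption: cuobjdump from the same toolkit is on PATH; its --list-elf option lists the embedded cubins per GPU architecture, one way to confirm that only the GENCODE targets survive, as discussed in the nvprune thread above):

    bash common/install_cuda_aarch64.sh 12.4
    # after prune_124, only the sm_50..sm_90 targets listed in GENCODE should remain
    /usr/local/cuda-12.4/bin/cuobjdump --list-elf /usr/local/cuda-12.4/lib64/libcufft_static.a | head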