diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml
index 48a6ca60a..777acdcf9 100644
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@@ -3,7 +3,7 @@ name: Build manywheel docker images
 on:
   push:
     branches:
-      main
+      - main
     paths:
       - .github/workflows/build-manywheel-images.yml
       - manywheel/Dockerfile
diff --git a/common/install_cache.sh b/common/install_cache.sh
new file mode 100644
index 000000000..3da5c86c8
--- /dev/null
+++ b/common/install_cache.sh
@@ -0,0 +1,155 @@
+#!/bin/bash
+
+set -ex
+
+# Build sccache from the pytorch/sccache repository with cargo and install the
+# binary into /opt/cache/bin; cargo and rustc are removed again afterwards.
+install_ubuntu() {
+  echo "Preparing to build sccache from source"
+  apt-get update
+  # libssl-dev will not work as it is upgraded to libssl3 in Ubuntu-22.04.
+  # Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh`
+  apt-get install -y cargo
+  echo "Checking out sccache repo"
+  # TODO: https://github.com/pytorch/sccache is very outdated, so let's take
+  # a note here to update it later with the latest code from upstream
+  git clone https://github.com/pytorch/sccache
+  cd sccache
+  echo "Building sccache"
+  cargo build --release
+  cp target/release/sccache /opt/cache/bin
+  echo "Cleaning up"
+  cd ..
+  rm -rf sccache
+  apt-get remove -y cargo rustc
+  apt-get autoclean && apt-get clean
+}
+
+install_centos() {
+  # Install sccache from source to get the version that supports NVCC
+  echo "Preparing to build sccache from source"
+  yum install -y cargo openssl-devel
+
+  echo "Download sccache 0.3.0"
+  wget https://github.com/mozilla/sccache/archive/refs/tags/v0.3.0.tar.gz
+  tar xfz v0.3.0.tar.gz
+
+  cd sccache-0.3.0
+  echo "Building sccache"
+  cargo build --release
+  cp target/release/sccache /opt/cache/bin
+
+  echo "Cleaning up"
+  cd ..
+  # The tarball unpacks to sccache-0.3.0 (not sccache); remove it and the archive
+  rm -rf sccache-0.3.0 v0.3.0.tar.gz
+  yum remove -y cargo rustc
+}
+
+# Fallback for any other distribution: download a prebuilt sccache binary.
+install_binary() {
+  echo "Downloading sccache binary from S3 repo"
+  # --fail: exit non-zero on an HTTP error instead of saving the error page as the binary
+  curl --fail --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /opt/cache/bin/sccache
+}
+
+mkdir -p /opt/cache/bin
+mkdir -p /opt/cache/lib
+sed -e 's|PATH="\(.*\)"|PATH="/opt/cache/bin:\1"|g' -i /etc/environment
+export PATH="/opt/cache/bin:$PATH"
+
+# Setup compiler cache
+if [ -n "$ROCM_VERSION" ]; then
+  # --fail: exit non-zero on an HTTP error instead of saving the error page as the binary
+  curl --fail --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache
+else
+  ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+  case "$ID" in
+    ubuntu)
+      install_ubuntu
+      ;;
+    centos)
+      install_centos
+      ;;
+    *)
+      install_binary
+      ;;
+  esac
+fi
+chmod a+x /opt/cache/bin/sccache
+
+# Create /opt/cache/bin/$1: a /bin/sh wrapper that execs the real compiler
+# (its path is resolved once, when this script runs) through sccache, unless
+# the wrapper's parent process is already sccache; then it execs the compiler directly.
+function write_sccache_stub() {
+  # Unset LD_PRELOAD for ps because of asan + ps issues
+  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
+  printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n  exec sccache $(which $1) \"\$@\"\nelse\n  exec $(which $1) \"\$@\"\nfi" > "/opt/cache/bin/$1"
+  chmod a+x "/opt/cache/bin/$1"
+}
+
+write_sccache_stub cc
+write_sccache_stub c++
+write_sccache_stub gcc
+write_sccache_stub g++
+
+# NOTE: See specific ROCM_VERSION case below.
+if [ "x$ROCM_VERSION" = x ]; then
+  write_sccache_stub clang
+  write_sccache_stub clang++
+fi
+
+if [ -n "$CUDA_VERSION" ]; then
+  # TODO: This is a workaround for the fact that PyTorch's FindCUDA
+  # implementation cannot find nvcc if it is setup this way, because it
+  # appears to search for the nvcc in PATH, and use its path to infer
+  # where CUDA is installed. Instead, we install an nvcc symlink outside
+  # of the PATH, and set CUDA_NVCC_EXECUTABLE so that we make use of it.
+
+  write_sccache_stub nvcc
+  mv /opt/cache/bin/nvcc /opt/cache/lib/
+fi
+
+if [ -n "$ROCM_VERSION" ]; then
+  # ROCm compiler is hcc or clang. However, it is commonly invoked via hipcc wrapper.
+  # hipcc will call either hcc or clang using an absolute path starting with /opt/rocm,
+  # causing the /opt/cache/bin to be skipped. We must create the sccache wrappers
+  # directly under /opt/rocm while also preserving the original compiler names.
+  # Note symlinks will chain as follows: [hcc or clang++] -> clang -> clang-??
+  # Final link in symlink chain must point back to original directory.
+
+  # Original compiler is moved one directory deeper. Wrapper replaces it.
+  function write_sccache_stub_rocm() {
+    OLDCOMP=$1
+    COMPNAME=$(basename $OLDCOMP)
+    TOPDIR=$(dirname $OLDCOMP)
+    WRAPPED="$TOPDIR/original/$COMPNAME"
+    mv "$OLDCOMP" "$WRAPPED"
+    printf "#!/bin/sh\nexec sccache $WRAPPED \"\$@\"" > "$OLDCOMP"
+    chmod a+x "$OLDCOMP"
+  }
+
+  if [[ -e "/opt/rocm/hcc/bin/hcc" ]]; then
+    # ROCm 3.3 or earlier.
+    mkdir /opt/rocm/hcc/bin/original
+    write_sccache_stub_rocm /opt/rocm/hcc/bin/hcc
+    write_sccache_stub_rocm /opt/rocm/hcc/bin/clang
+    write_sccache_stub_rocm /opt/rocm/hcc/bin/clang++
+    # Fix last link in symlink chain, clang points to versioned clang in prior dir
+    pushd /opt/rocm/hcc/bin/original
+    ln -s ../$(readlink clang)
+    popd
+  elif [[ -e "/opt/rocm/llvm/bin/clang" ]]; then
+    # ROCm 3.5 and beyond.
+    mkdir /opt/rocm/llvm/bin/original
+    write_sccache_stub_rocm /opt/rocm/llvm/bin/clang
+    write_sccache_stub_rocm /opt/rocm/llvm/bin/clang++
+    # Fix last link in symlink chain, clang points to versioned clang in prior dir
+    pushd /opt/rocm/llvm/bin/original
+    ln -s ../$(readlink clang)
+    popd
+  else
+    echo "Cannot find ROCm compiler."
+    exit 1
+  fi
+fi
diff --git a/manywheel/Dockerfile b/manywheel/Dockerfile
index 8ddee1dab..17b3fd887 100644
--- a/manywheel/Dockerfile
+++ b/manywheel/Dockerfile
@@ -150,6 +150,12 @@ RUN yum install -y cmake3 && \
 RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-5.el7.noarch.rpm
 RUN yum install -y ninja-build
 
+# Install compiler cache (do this last, so we get priority in PATH)
+COPY ./common/install_cache.sh install_cache.sh
+ENV PATH=/opt/cache/bin:$PATH
+RUN bash ./install_cache.sh && rm install_cache.sh
+ENV CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache
+
 FROM cpu_final as cuda_final
 RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
@@ -173,3 +179,9 @@ RUN yum install -y cmake3 && \
     ln -s /usr/bin/cmake3 /usr/bin/cmake
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
+
+# Install compiler cache (do this last, so we get priority in PATH)
+COPY ./common/install_cache.sh install_cache.sh
+ENV PATH=/opt/cache/bin:$PATH
+RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_cache.sh && rm install_cache.sh
+ENV CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache
diff --git a/manywheel/build_common.sh b/manywheel/build_common.sh
index c213145ef..1e2044c91 100644
--- a/manywheel/build_common.sh
+++ b/manywheel/build_common.sh
@@ -4,6 +4,61 @@
 set -ex
 SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 
+# Courtesy of pytorch/.jenkins/pytorch/common_utils.sh
+#
+# - 1st arg: code to add
+# - remaining args: names of traps to modify
+#
+# NOTE(review): the error paths call `fatal`, which this patch does not define - confirm it is in scope
+trap_add() {
+    trap_add_cmd=$1; shift || fatal "${FUNCNAME[0]} usage error"
+    for trap_add_name in "$@"; do
+        trap -- "$(
+            # helper fn to get existing trap command from output
+            # of trap -p
+            extract_trap_cmd() { printf '%s\n' "$3"; }
+            # print existing trap command with newline
+            eval "extract_trap_cmd $(trap -p "${trap_add_name}")"
+            # print the new trap command
+            printf '%s\n' "${trap_add_cmd}"
+        )" "${trap_add_name}" \
+            || fatal "unable to add to trap ${trap_add_name}"
+    done
+}
+# set the trace attribute for the above function. this is
+# required to modify DEBUG or RETURN traps because functions don't
+# inherit them unless the trace attribute is set
+declare -f -t trap_add
+
+# Initialize sccache
+if [[ -n "$SCCACHE_BUCKET" ]] && command -v sccache > /dev/null; then
+  # Save sccache logs to file
+  sccache --stop-server > /dev/null 2>&1 || true
+  rm -f ~/sccache_error.log || true
+
+  export SCCACHE_IDLE_TIMEOUT=1200
+  export SCCACHE_ERROR_LOG=~/sccache_error.log
+  export RUST_LOG=sccache::server=error
+
+  # Report sccache stats for easier debugging
+  sccache --zero-stats
+  function sccache_epilogue() {
+    sccache --show-stats
+    sccache --stop-server || true
+  }
+
+  trap_add sccache_epilogue EXIT
+else
+  # Not using sccache if it's not set up properly
+  rm -f /opt/cache/bin/cc
+  rm -f /opt/cache/bin/c++
+  rm -f /opt/cache/bin/clang
+  rm -f /opt/cache/bin/clang++
+  rm -f /opt/cache/bin/gcc
+  rm -f /opt/cache/bin/g++
+
+  unset CMAKE_CUDA_COMPILER_LAUNCHER
+fi
 
 # Require only one python installation
 if [[ -z "$DESIRED_PYTHON" ]]; then