# Patch summary (text before the first "diff --git" header is ignored by patch tools):
#   * publish_manylinux_image.yml: re-enables the CUDA 11.8 image build, now using
#     docker/Dockerfile.manylinux_gcc11.
#   * release_wheel.yml: adds CUDA 11.8 to the build matrix and pulls
#     vectorchai/scalellm_manylinux:cuda<version> instead of scalellm_builder.
#   * Dockerfile.base / Dockerfile.devel: COPY the local common/install_cuda.sh instead of
#     downloading it with wget; Dockerfile.devel also drops the user/python install steps,
#     moves the CUDA install after ninja, and bumps CMake to 3.29.3 and Ninja to 1.11.1.
#   * Dockerfile.manylinux_gcc11 (new): manylinux_2_28 image with gcc-toolset-11, CUDA,
#     ninja, ccache and rust.
#   * install_cmake.sh: downloads the release tarball from GitHub with wget.
#   * docker/common/*.sh: mode 100644 -> 100755.
#   * run_in_docker.sh: mounts and exports VCPKG_DEFAULT_BINARY_CACHE / CCACHE_DIR only
#     when they are non-empty.
#
# Review edits applied to added ("+") lines only; hunk line counts are unchanged:
#   * Dockerfile.manylinux_gcc11: "AS" casing, key=value ENV form, "yum clean all" in the
#     install layer, "&&" instead of ";" in the version checks so a missing tool fails the
#     build, pipefail + HTTPS/TLS 1.2 enforcement on the rustup download, and a trailing
#     newline at end of file.
#   * Dockerfile.devel: key=value ENV form on the re-added CUDA lines.
#   * install_cmake.sh: quoted "${file}" in the tar call.
#
# NOTE(review): the "index" blob hashes for Dockerfile.devel, Dockerfile.manylinux_gcc11 and
#   install_cmake.sh are left as recorded and no longer match the edited content.
# NOTE(review): leading indentation and the positions of blank context lines were
#   reconstructed from the hunk line counts - confirm against the target tree before applying.
# NOTE(review): release_wheel.yml changes only the pulled image; the image name passed to
#   "docker run" lies outside the hunk - confirm it matches.
diff --git a/.github/workflows/publish_manylinux_image.yml b/.github/workflows/publish_manylinux_image.yml
index fd3a49a8..cfcf63d1 100644
--- a/.github/workflows/publish_manylinux_image.yml
+++ b/.github/workflows/publish_manylinux_image.yml
@@ -37,16 +37,16 @@ jobs:
           tags: |
             vectorchai/scalellm_manylinux:cuda12.1
 
-      # - name: Build base for cuda 11.8
-      #   uses: docker/build-push-action@v5
-      #   with:
-      #     context: ./docker
-      #     file: ./docker/Dockerfile.manylinux
-      #     push: true
-      #     cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
-      #     cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
-      #     build-args: |
-      #       CUDA_VERSION=11.8
-      #     tags: |
-      #       vectorchai/scalellm_manylinux:cuda11.8
+      - name: Build base for cuda 11.8
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker
+          file: ./docker/Dockerfile.manylinux_gcc11
+          push: true
+          cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
+          cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
+          build-args: |
+            CUDA_VERSION=11.8
+          tags: |
+            vectorchai/scalellm_manylinux:cuda11.8
 
diff --git a/.github/workflows/release_wheel.yml b/.github/workflows/release_wheel.yml
index 32ac47a7..660f2f40 100644
--- a/.github/workflows/release_wheel.yml
+++ b/.github/workflows/release_wheel.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         python: ["3.9", "3.10", "3.11"]
-        cuda: ["12.1"]
+        cuda: ["11.8", "12.1"]
         torch: ["2.2", "2.3"]
     runs-on: [self-hosted, linux, release]
     steps:
@@ -27,7 +27,7 @@ jobs:
 
       - name: Build wheel
         run: |
-          docker pull vectorchai/scalellm_builder:cuda${{ matrix.cuda }}-ubuntu22.04
+          docker pull vectorchai/scalellm_manylinux:cuda${{ matrix.cuda }}
           docker run --rm -t \
             -v "$CI_CACHE_DIR":/ci_cache \
             -v "$GITHUB_WORKSPACE":/ScaleLLM \
diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base
index d3bc5df7..cf53d463 100644
--- a/docker/Dockerfile.base
+++ b/docker/Dockerfile.base
@@ -23,7 +23,7 @@ RUN rm install_python.sh
 
 # Install cuda, cudnn and nccl
 ARG CUDA_VERSION=12.1
-RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh
+COPY ./common/install_cuda.sh install_cuda.sh
 RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
 ENV DESIRED_CUDA ${CUDA_VERSION}
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
diff --git a/docker/Dockerfile.devel b/docker/Dockerfile.devel
index c4caca89..8e11c174 100644
--- a/docker/Dockerfile.devel
+++ b/docker/Dockerfile.devel
@@ -9,25 +9,6 @@ ENV DEBIAN_FRONTEND noninteractive
 COPY ./common/install_base.sh install_base.sh
 RUN bash ./install_base.sh && rm install_base.sh
 
-# Install user
-COPY ./common/install_user.sh install_user.sh
-RUN bash ./install_user.sh && rm install_user.sh
-
-# Install multiple python versions
-COPY ./common/install_python.sh install_python.sh
-RUN bash ./install_python.sh "3.9.0"
-RUN bash ./install_python.sh "3.10.1"
-RUN bash ./install_python.sh "3.11.0"
-RUN bash ./install_python.sh "3.12.0"
-RUN rm install_python.sh
-
-# Install cuda, cudnn and nccl
-ARG CUDA_VERSION=12.1
-RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh
-RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
-ENV DESIRED_CUDA ${CUDA_VERSION}
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
-
 # Install gcc
 ARG GCC_VERSION=12
 RUN apt-get update \
@@ -36,16 +17,23 @@ RUN apt-get update \
 COPY ./common/install_gcc.sh install_gcc.sh
 RUN bash ./install_gcc.sh && rm install_gcc.sh
 
-ARG CMAKE_VERSION=3.18.5
+ARG CMAKE_VERSION=3.29.3
 COPY ./common/install_cmake.sh install_cmake.sh
 RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
 RUN rm install_cmake.sh
 
-ARG NINJA_VERSION=1.9.0
+ARG NINJA_VERSION=1.11.1
 COPY ./common/install_ninja.sh install_ninja.sh
 RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
 RUN rm install_ninja.sh
 
+# Install cuda, cudnn and nccl
+ARG CUDA_VERSION=12.1
+COPY ./common/install_cuda.sh install_cuda.sh
+RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
+ENV DESIRED_CUDA=${CUDA_VERSION}
+ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
+
 # install rust
 ENV RUSTUP_HOME=/usr/local/rustup
 ENV CARGO_HOME=/usr/local/cargo
diff --git a/docker/Dockerfile.manylinux b/docker/Dockerfile.manylinux
index 8f6c52e4..13fd4897 100644
--- a/docker/Dockerfile.manylinux
+++ b/docker/Dockerfile.manylinux
@@ -21,7 +21,7 @@ COPY ./common/install_cuda.sh install_cuda.sh
 RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
 
-# ARG CMAKE_VERSION=3.18.5
+# ARG CMAKE_VERSION=3.29.3
 # COPY ./common/install_cmake.sh install_cmake.sh
 # RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
 # RUN rm install_cmake.sh
diff --git a/docker/Dockerfile.manylinux_gcc11 b/docker/Dockerfile.manylinux_gcc11
new file mode 100644
index 00000000..43f24bd4
--- /dev/null
+++ b/docker/Dockerfile.manylinux_gcc11
@@ -0,0 +1,56 @@
+FROM quay.io/pypa/manylinux_2_28_x86_64 AS base
+
+LABEL maintainer="mi@vectorch.com"
+ENV DEBIAN_FRONTEND=noninteractive
+
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Install common dependencies
+COPY ./common/install_base.sh install_base.sh
+RUN bash ./install_base.sh && rm install_base.sh
+
+# Install user
+COPY ./common/install_user.sh install_user.sh
+RUN bash ./install_user.sh && rm install_user.sh
+
+# Install gcc-11
+RUN rm -rf /opt/rh/gcc-toolset-12
+RUN yum install -y gcc-toolset-11-toolchain && yum clean all
+ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11/root/usr/lib:$LD_LIBRARY_PATH
+RUN gcc --version && g++ --version
+
+
+# Install cuda, cudnn and nccl
+ARG CUDA_VERSION=11.8
+COPY ./common/install_cuda.sh install_cuda.sh
+RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
+ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
+
+# ARG CMAKE_VERSION=3.29.3
+# COPY ./common/install_cmake.sh install_cmake.sh
+# RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+# RUN rm install_cmake.sh
+
+ARG NINJA_VERSION=1.11.1
+COPY ./common/install_ninja.sh install_ninja.sh
+RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
+RUN rm install_ninja.sh
+
+ARG CCACHE_VERSION=4.8.3
+COPY ./common/install_ccache.sh install_ccache.sh
+RUN if [ -n "${CCACHE_VERSION}" ]; then bash ./install_ccache.sh; fi
+RUN rm install_ccache.sh
+
+# install rust
+ENV RUSTUP_HOME=/usr/local/rustup
+ENV CARGO_HOME=/usr/local/cargo
+ENV PATH=/usr/local/cargo/bin:$PATH
+RUN bash -o pipefail -c "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
+# give everyone permission to use rust
+RUN chmod -R a+w ${RUSTUP_HOME} ${CARGO_HOME}
+RUN rustup --version && cargo --version && rustc --version
+
+CMD ["bash"]
diff --git a/docker/common/install_base.sh b/docker/common/install_base.sh
old mode 100644
new mode 100755
diff --git a/docker/common/install_ccache.sh b/docker/common/install_ccache.sh
old mode 100644
new mode 100755
diff --git a/docker/common/install_cmake.sh b/docker/common/install_cmake.sh
old mode 100644
new mode 100755
index 26257bf1..4d94129e
--- a/docker/common/install_cmake.sh
+++ b/docker/common/install_cmake.sh
@@ -19,13 +19,12 @@ case "$ID" in
     ;;
 esac
 
-# Turn 3.6.3 into v3.6
-path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/')
-file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz"
+path="v${CMAKE_VERSION}"
+file="cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
 
 # Download and install specific CMake version in /usr/local
 pushd /tmp
-curl -Os --retry 3 "https://cmake.org/files/${path}/${file}"
-tar -C /usr/local --strip-components 1 --no-same-owner -zxf cmake-*.tar.gz
+wget -q "https://github.com/Kitware/CMake/releases/download/${path}/${file}"
+tar -C /usr/local --strip-components 1 --no-same-owner -zxf "${file}"
 rm -f cmake-*.tar.gz
 popd
\ No newline at end of file
diff --git a/docker/common/install_cuda.sh b/docker/common/install_cuda.sh
old mode 100644
new mode 100755
diff --git a/docker/common/install_gcc.sh b/docker/common/install_gcc.sh
old mode 100644
new mode 100755
diff --git a/docker/common/install_ninja.sh b/docker/common/install_ninja.sh
old mode 100644
new mode 100755
diff --git a/docker/common/install_python.sh b/docker/common/install_python.sh
old mode 100644
new mode 100755
diff --git a/docker/common/install_user.sh b/docker/common/install_user.sh
old mode 100644
new mode 100755
diff --git a/tools/run_in_docker.sh b/tools/run_in_docker.sh
index 1485fb85..4fab30be 100755
--- a/tools/run_in_docker.sh
+++ b/tools/run_in_docker.sh
@@ -62,19 +62,25 @@ RUN_OPTS+=("-v $(pwd):$(pwd)")
 RUN_OPTS+=("-v /tmp:/tmp")
 RUN_OPTS+=("-v ${HOME}:${HOME}")
 
-# carry over some environment variables
-RUN_OPTS+=("-e VCPKG_DEFAULT_BINARY_CACHE=${VCPKG_DEFAULT_BINARY_CACHE}")
-RUN_OPTS+=("-e CCACHE_DIR=${CCACHE_DIR}")
+# carry over cache settings
+if [[ -n "${VCPKG_DEFAULT_BINARY_CACHE}" ]]; then
+  RUN_OPTS+=("-v ${VCPKG_DEFAULT_BINARY_CACHE}:${VCPKG_DEFAULT_BINARY_CACHE}")
+  RUN_OPTS+=("-e VCPKG_DEFAULT_BINARY_CACHE=${VCPKG_DEFAULT_BINARY_CACHE}")
+fi
+
+if [[ -n "${CCACHE_DIR}" ]]; then
+  RUN_OPTS+=("-v ${CCACHE_DIR}:${CCACHE_DIR}")
+  RUN_OPTS+=("-e CCACHE_DIR=${CCACHE_DIR}")
+fi
 
 CMD="sh -c 'cd $(pwd); $@'"
 
 [[ "${CMD}" = "" ]] && usage
 [[ ! -x $(command -v docker) ]] && echo "ERROR: 'docker' command missing from PATH." && usage
 
-echo "== Pulling docker image: ${IMAGE}"
 if ! docker pull ${IMAGE} ; then
   echo "WARNING: Failed to docker pull image ${IMAGE}"
 fi
 
-echo "docker run ${RUN_OPTS[@]} ${IMAGE} bash -c \"$(get_switch_user_cmd) ${CMD}\""
+# echo "docker run ${RUN_OPTS[@]} ${IMAGE} bash -c \"$(get_switch_user_cmd) ${CMD}\""
 docker run ${RUN_OPTS[@]} ${IMAGE} bash -c "$(get_switch_user_cmd) ${CMD}"