Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix] added cuda 11.8 support for manylinux #186

Merged
merged 2 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions .github/workflows/publish_manylinux_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,16 @@ jobs:
tags: |
vectorchai/scalellm_manylinux:cuda12.1

# - name: Build base for cuda 11.8
# uses: docker/build-push-action@v5
# with:
# context: ./docker
# file: ./docker/Dockerfile.manylinux
# push: true
# cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
# cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
# build-args: |
# CUDA_VERSION=11.8
# tags: |
# vectorchai/scalellm_manylinux:cuda11.8
- name: Build base for cuda 11.8
uses: docker/build-push-action@v5
with:
context: ./docker
file: ./docker/Dockerfile.manylinux_gcc11
push: true
cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
build-args: |
CUDA_VERSION=11.8
tags: |
vectorchai/scalellm_manylinux:cuda11.8

4 changes: 2 additions & 2 deletions .github/workflows/release_wheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
fail-fast: false
matrix:
python: ["3.9", "3.10", "3.11"]
cuda: ["12.1"]
cuda: ["11.8", "12.1"]
torch: ["2.2", "2.3"]
runs-on: [self-hosted, linux, release]
steps:
Expand All @@ -27,7 +27,7 @@ jobs:

- name: Build wheel
run: |
docker pull vectorchai/scalellm_builder:cuda${{ matrix.cuda }}-ubuntu22.04
docker pull vectorchai/scalellm_manylinux:cuda${{ matrix.cuda }}
docker run --rm -t \
-v "$CI_CACHE_DIR":/ci_cache \
-v "$GITHUB_WORKSPACE":/ScaleLLM \
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ RUN rm install_python.sh

# Install cuda, cudnn and nccl
ARG CUDA_VERSION=12.1
RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh
COPY ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
Expand Down
30 changes: 9 additions & 21 deletions docker/Dockerfile.devel
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,6 @@ ENV DEBIAN_FRONTEND noninteractive
COPY ./common/install_base.sh install_base.sh
RUN bash ./install_base.sh && rm install_base.sh

# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh

# Install multiple python versions
COPY ./common/install_python.sh install_python.sh
RUN bash ./install_python.sh "3.9.0"
RUN bash ./install_python.sh "3.10.1"
RUN bash ./install_python.sh "3.11.0"
RUN bash ./install_python.sh "3.12.0"
RUN rm install_python.sh

# Install cuda, cudnn and nccl
ARG CUDA_VERSION=12.1
RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

# Install gcc
ARG GCC_VERSION=12
RUN apt-get update \
Expand All @@ -36,16 +17,23 @@ RUN apt-get update \
COPY ./common/install_gcc.sh install_gcc.sh
RUN bash ./install_gcc.sh && rm install_gcc.sh

ARG CMAKE_VERSION=3.18.5
ARG CMAKE_VERSION=3.29.3
COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh

ARG NINJA_VERSION=1.9.0
ARG NINJA_VERSION=1.11.1
COPY ./common/install_ninja.sh install_ninja.sh
RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
RUN rm install_ninja.sh

# Install cuda, cudnn and nccl
ARG CUDA_VERSION=12.1
COPY ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

# install rust
ENV RUSTUP_HOME=/usr/local/rustup
ENV CARGO_HOME=/usr/local/cargo
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.manylinux
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ COPY ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

# ARG CMAKE_VERSION=3.18.5
# ARG CMAKE_VERSION=3.29.3
# COPY ./common/install_cmake.sh install_cmake.sh
# RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
# RUN rm install_cmake.sh
Expand Down
56 changes: 56 additions & 0 deletions docker/Dockerfile.manylinux_gcc11
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Builder image based on manylinux_2_28 with gcc-toolset-11, CUDA (11.8 by
# default), ninja, ccache and a rust toolchain installed.
#
# NOTE(review): the base image is referenced without a tag or digest, so a
# rebuild may pick up a different upstream image. Consider pinning it.
FROM quay.io/pypa/manylinux_2_28_x86_64 AS base

LABEL maintainer="mi@vectorch.com"

# NOTE(review): DEBIAN_FRONTEND only affects apt-based tooling; it is kept here
# so the image environment stays identical to the previous definition.
ENV DEBIAN_FRONTEND=noninteractive

ENV LC_ALL=en_US.UTF-8 \
    LANG=en_US.UTF-8 \
    LANGUAGE=en_US.UTF-8

# Install common dependencies
COPY ./common/install_base.sh install_base.sh
RUN bash ./install_base.sh && rm install_base.sh

# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh

# Install gcc-11.
# The gcc-toolset-12 directory is removed so that only gcc-toolset-11 is
# visible in the final filesystem (presumably because CUDA 11.8 does not
# accept gcc 12 as host compiler -- confirm). The yum cache is cleaned in the
# same layer so it does not end up in the image.
RUN rm -rf /opt/rh/gcc-toolset-12 \
    && yum install -y gcc-toolset-11-toolchain \
    && yum clean all
ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11/root/usr/lib:$LD_LIBRARY_PATH
# Sanity check: fail the build if either compiler is missing.
RUN gcc --version && g++ --version


# Install cuda, cudnn and nccl
ARG CUDA_VERSION=11.8
COPY ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

# ARG CMAKE_VERSION=3.29.3
# COPY ./common/install_cmake.sh install_cmake.sh
# RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
# RUN rm install_cmake.sh

# Install ninja (skipped when NINJA_VERSION is passed as an empty build arg).
ARG NINJA_VERSION=1.11.1
COPY ./common/install_ninja.sh install_ninja.sh
RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi \
    && rm install_ninja.sh

# Install ccache (skipped when CCACHE_VERSION is passed as an empty build arg).
ARG CCACHE_VERSION=4.8.3
COPY ./common/install_ccache.sh install_ccache.sh
RUN if [ -n "${CCACHE_VERSION}" ]; then bash ./install_ccache.sh; fi \
    && rm install_ccache.sh

# install rust
ENV RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    PATH=/usr/local/cargo/bin:$PATH
# The installer is downloaded to a file instead of being piped into sh, so a
# failed or truncated download aborts the build instead of being ignored.
# The chmod (give everyone permission to use rust) runs in the same layer as
# the install so the toolchain files are not stored twice in the image.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh \
    && sh /tmp/rustup-init.sh -y \
    && rm -f /tmp/rustup-init.sh \
    && chmod -R a+w "${RUSTUP_HOME}" "${CARGO_HOME}"
# Sanity check: fail the build if any rust tool is missing.
RUN rustup --version && cargo --version && rustc --version

CMD ["bash"]
Empty file modified docker/common/install_base.sh
100644 → 100755
Empty file.
Empty file modified docker/common/install_ccache.sh
100644 → 100755
Empty file.
9 changes: 4 additions & 5 deletions docker/common/install_cmake.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,12 @@ case "$ID" in
;;
esac

# Turn 3.6.3 into v3.6
path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/')
file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz"
path="v${CMAKE_VERSION}"
file="cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"

# Download and install specific CMake version in /usr/local
pushd /tmp
curl -Os --retry 3 "https://cmake.org/files/${path}/${file}"
tar -C /usr/local --strip-components 1 --no-same-owner -zxf cmake-*.tar.gz
wget -q "https://github.com/Kitware/CMake/releases/download/${path}/${file}"
tar -C /usr/local --strip-components 1 --no-same-owner -zxf ${file}
rm -f cmake-*.tar.gz
popd
Empty file modified docker/common/install_cuda.sh
100644 → 100755
Empty file.
Empty file modified docker/common/install_gcc.sh
100644 → 100755
Empty file.
Empty file modified docker/common/install_ninja.sh
100644 → 100755
Empty file.
Empty file modified docker/common/install_python.sh
100644 → 100755
Empty file.
Empty file modified docker/common/install_user.sh
100644 → 100755
Empty file.
16 changes: 11 additions & 5 deletions tools/run_in_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,25 @@ RUN_OPTS+=("-v $(pwd):$(pwd)")
RUN_OPTS+=("-v /tmp:/tmp")
RUN_OPTS+=("-v ${HOME}:${HOME}")

# carry over some environment variables
RUN_OPTS+=("-e VCPKG_DEFAULT_BINARY_CACHE=${VCPKG_DEFAULT_BINARY_CACHE}")
RUN_OPTS+=("-e CCACHE_DIR=${CCACHE_DIR}")
# carry over cache settings
if [[ -n "${VCPKG_DEFAULT_BINARY_CACHE}" ]]; then
RUN_OPTS+=("-v ${VCPKG_DEFAULT_BINARY_CACHE}:${VCPKG_DEFAULT_BINARY_CACHE}")
RUN_OPTS+=("-e VCPKG_DEFAULT_BINARY_CACHE=${VCPKG_DEFAULT_BINARY_CACHE}")
fi

if [[ -n "${CCACHE_DIR}" ]]; then
RUN_OPTS+=("-v ${CCACHE_DIR}:${CCACHE_DIR}")
RUN_OPTS+=("-e CCACHE_DIR=${CCACHE_DIR}")
fi

CMD="sh -c 'cd $(pwd); $@'"

[[ "${CMD}" = "" ]] && usage
[[ ! -x $(command -v docker) ]] && echo "ERROR: 'docker' command missing from PATH." && usage

echo "== Pulling docker image: ${IMAGE}"
if ! docker pull ${IMAGE} ; then
echo "WARNING: Failed to docker pull image ${IMAGE}"
fi

echo "docker run ${RUN_OPTS[@]} ${IMAGE} bash -c \"$(get_switch_user_cmd) ${CMD}\""
# echo "docker run ${RUN_OPTS[@]} ${IMAGE} bash -c \"$(get_switch_user_cmd) ${CMD}\""
docker run ${RUN_OPTS[@]} ${IMAGE} bash -c "$(get_switch_user_cmd) ${CMD}"
Loading