Skip to content

Commit

Permalink
stable, dev PyTorch in Dockerfile and conda gh actions (#3074)
Browse files Browse the repository at this point in the history
* dockerfile and actions file

* dockerfile and actions file

* added pytorch conda cpu nightly

* added pytorch conda cpu nightly

* recopy base reqs

* gh action `include` torch nightly

* add pytorch nightly & conda gh badge

* rebase

* fix horovod

* proposal refactor

* Update .github/workflows/ci_pt-conda.yml

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>

* Update .github/workflows/ci_pt-conda.yml

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>

* update

* update

* fix cmd

* filled &&

* fix

* add -y

* torchvision >0.7 allowed

* explicitly install torchvision

* use HOROVOD_GPU_OPERATIONS env variable

* CI

* skip 1.7

* table

Co-authored-by: Jirka Borovec <jirka@pytorchlightning.ai>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
  • Loading branch information
3 people authored Sep 17, 2020
1 parent 7b64472 commit 8be79a9
Show file tree
Hide file tree
Showing 9 changed files with 127 additions and 73 deletions.
4 changes: 2 additions & 2 deletions .codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ codecov:
strict_yaml_branch: "yaml-config"
require_ci_to_pass: yes
notify:
after_n_builds: 22
after_n_builds: 23
wait_for_ci: yes
# https://docs.codecov.io/docs/codecov-yaml#section-expired-reports
max_report_age: off
Expand Down Expand Up @@ -64,4 +64,4 @@ comment:
layout: header, diff
require_changes: false
behavior: default # update if exists else create new
after_n_builds: 22
after_n_builds: 23
45 changes: 25 additions & 20 deletions .github/workflows/ci_dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,23 +49,28 @@ jobs:
push: false
timeout-minutes: 40

# TODO: uncomment this with fixing CUDA docker, no need to increase mergify count
# build-cuda:
# runs-on: ubuntu-20.04
# strategy:
# fail-fast: false
# matrix:
# python_version: [3.7]
# pytorch_version: [1.5]
# steps:
# - name: Checkout
# uses: actions/checkout@v2
#
# - name: Publish Master to Docker
# # publish master
# uses: docker/build-push-action@v1.1.0
# with:
# dockerfile: dockers/base-cuda/Dockerfile
# build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }}
# push: false
# timeout-minutes: 40
# Build (without pushing) the base-cuda Docker image for every combination in the matrix below.
build-cuda:
runs-on: ubuntu-20.04
strategy:
# keep the remaining matrix jobs running when one of them fails
fail-fast: false
matrix:
python_version: [3.7]
pytorch_version: [1.6]
pytorch_channel: [pytorch]
# https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations
# extra combination: PyTorch 1.7 taken from the `pytorch-nightly` channel
include:
- python_version: 3.7
pytorch_version: 1.7
pytorch_channel: pytorch-nightly
steps:
- name: Checkout
uses: actions/checkout@v2

- name: Build Docker
# build only, the image is not pushed (see `push: false` below)
uses: docker/build-push-action@v1.1.0
with:
dockerfile: dockers/base-cuda/Dockerfile
# matrix values are handed to the Dockerfile as comma-separated build arguments
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},PYTORCH_CHANNEL=${{ matrix.pytorch_channel }}
push: false
timeout-minutes: 40
13 changes: 8 additions & 5 deletions .github/workflows/ci_pt-conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,24 @@ jobs:
matrix:
os: [ubuntu-20.04]
python-version: [3.7]
# todo: add nightly versions
pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7
pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7 # TODO: fix failing test and add 1.7 (nightly) add badge

# Timeout: https://stackoverflow.com/a/59076067/4521646
timeout-minutes: 35
steps:
- uses: actions/checkout@v2

- name: Setup pyTorch
- name: Setup PyTorch nightly channel
if: matrix.pytorch-version >= 1.7
run: |
# NOTE: this requires that the channel is listed in the yaml before the packages (only the first 'pytorch' occurrence is replaced)
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', 'pytorch-nightly', 1) ; open(fname, 'w').write(req)"
- name: Setup PyTorch version
run: |
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('torch>=1.3', 'torch=${{ matrix.pytorch-version }}') ; open(fname, 'w').write(req)"
cat environment.yml
# TODO: set source for nightly

- name: Cache conda
uses: actions/cache@v2
with:
Expand Down
10 changes: 8 additions & 2 deletions .github/workflows/docker-builds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,13 @@ jobs:
fail-fast: false
matrix:
python_version: [3.7]
pytorch_version: [1.3, 1.4, 1.5, 1.6.0]
pytorch_version: [1.3, 1.4, 1.5, 1.6]
pytorch_channel: [pytorch]
# https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations
include:
- python_version: 3.7
pytorch_version: 1.7
pytorch_channel: pytorch-nightly
steps:
- name: Checkout
uses: actions/checkout@v2
Expand All @@ -96,6 +102,6 @@ jobs:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
dockerfile: dockers/base-cuda/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }}
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},PYTORCH_CHANNEL=${{ matrix.pytorch_channel }}
tags: "base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
timeout-minutes: 40
2 changes: 1 addition & 1 deletion .mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pull_request_rules:
# no requested changes from any reviewer
- "#changes-requested-reviews-by=0"
# this serves as an "ALL checks have to pass" condition, as we actually have around 40 tests in total
- "#status-success>=44"
- "#status-success>=47"
# this is just in case since we rely on GPU tests (note: redundant to the above)
- status-success=continuous-integration/drone/pr
- "status-success=ci/circleci: TPU-tests"
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ Get started with our [3 steps guide](https://pytorch-lightning.readthedocs.io/en

| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 (latest) |
| :---: | :---: | :---: | :---: | :---: |
| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://35.192.60.23/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://35.192.60.23/PyTorchLightning/pytorch-lightning) |
| Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) |
| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22)
| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |

- _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_
- _\** tests run on two NVIDIA K80_
Expand Down
117 changes: 78 additions & 39 deletions dockers/base-cuda/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,59 +1,98 @@
# Existing images:
# --build-arg TORCH_VERSION=1.6.0 --build-arg CUDA_VERSION=10.1
# --build-arg TORCH_VERSION=1.5 --build-arg CUDA_VERSION=10.1
# --build-arg TORCH_VERSION=1.4 --build-arg CUDA_VERSION=10.1
# --build-arg TORCH_VERSION=1.3 --build-arg CUDA_VERSION=10.1
# --build-arg TORCH_VERSION=1.2 --build-arg CUDA_VERSION=10.0
# --build-arg TORCH_VERSION=1.1.0 --build-arg CUDA_VERSION=10.0 --build-arg CUDNN_VERSION=7.5

ARG TORCH_VERSION=1.6.0
ARG CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg PYTORCH_CHANNEL=pytorch-nightly --build-arg CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1

ARG CUDNN_VERSION=7
ARG CUDA_VERSION=10.1

# TODO: make this image from pure Ubuntu + install all NVIDIA drivers
# FROM nvidia/cuda:${CUDA_VERSION}-base
FROM pytorch/pytorch:${TORCH_VERSION}-cuda${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel
FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel

ARG PYTHON_VERSION=3.7
ARG PYTORCH_VERSION=1.6
ARG PYTORCH_CHANNEL=pytorch
ARG CONDA_VERSION=4.7.12

SHELL ["/bin/bash", "-c"]

ENV HOROVOD_GPU_ALLREDUCE=NCCL
ENV HOROVOD_GPU_BROADCAST=NCCL
ENV HOROVOD_GPU_OPERATIONS=NCCL
ENV HOROVOD_WITH_PYTORCH=1
ENV HOROVOD_WITHOUT_TENSORFLOW=1
ENV HOROVOD_WITHOUT_MXNET=1
ENV HOROVOD_WITH_GLOO=1
ENV HOROVOD_WITHOUT_MPI=1
ENV PATH="$PATH:/root/.local/bin"
ENV MAKEFLAGS="-j$(nproc)"
# TODO: uncomment in horovod next release, https://github.com/horovod/horovod/pull/2239
# ENV MAKEFLAGS="-j$(nproc)"

COPY ./tests/install_AMP.sh install_AMP.sh
COPY ./requirements/base.txt requirements.txt
COPY ./requirements/extra.txt requirements-extra.txt
COPY ./requirements/test.txt requirements-tests.txt
COPY ./requirements/examples.txt requirements-examples.txt

RUN apt-get update && \
apt-get install -y \
git \
cmake \
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
ca-certificates \
&& \

# Install AMP
bash install_AMP.sh && \
# Install all requirements
pip install -r requirements.txt && \
# HOROVOD_BUILD_ARCH_FLAGS="-mfma" && \
pip install -r requirements-extra.txt && \
pip install -r requirements-examples.txt && \
#pip install -r requirements-tests.txt && \
rm install_AMP.sh && \
rm requirements* && \

# Cleaning
# Cleaning
apt-get autoremove -y && \
apt-get clean && \
rm -rf /root/.cache && \
rm -rf /var/lib/apt/lists/*

# add non-root user
RUN useradd --create-home --shell /bin/bash flash

# Show what we have
USER flash
ENV CONDA_ENV=lightning
ENV WORKDIR=/home/flash
WORKDIR $WORKDIR

COPY --chown=flash environment.yml environment.yml

# install conda and python
RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p ${WORKDIR}/miniconda && \
rm ~/miniconda.sh

# add conda to path
ENV PATH="${WORKDIR}/miniconda/bin:$PATH"
ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/lib:$LD_LIBRARY_PATH"
ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda"

# conda init
RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION pytorch=$PYTORCH_VERSION torchvision cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \
conda init bash && \
# NOTE: this requires that the channel is listed in the yaml before the packages (only the first 'pytorch' occurrence is replaced)
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \
conda env update --file environment.yml && \
conda clean -ya && \
rm environment.yml && \
# Disable cache
conda install "pip>20.1" -y && \
pip config set global.cache-dir false

ENV PATH ${WORKDIR}/miniconda/envs/${CONDA_ENV}/bin:$PATH
ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH"
# make this environment the default one (comment out the following line if you do not want that):
ENV CONDA_DEFAULT_ENV=${CONDA_ENV}

COPY ./requirements/test.txt requirements-tests.txt
COPY ./requirements/examples.txt requirements-examples.txt

RUN \
echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \
source ~/.bashrc && \
# Install all requirements
pip install -r requirements-tests.txt --upgrade-strategy only-if-needed && \
pip install -r requirements-examples.txt --upgrade-strategy only-if-needed && \
rm requirements* && \
# Show what we have
pip --version && \
conda info && \
conda list && \
pip list

# Default command: start an interactive bash shell.
# Use an absolute path: the exec form runs the binary directly, so the relative
# "bin/bash" would be looked up under WORKDIR (/home/flash) instead of /bin.
CMD ["/bin/bash"]
3 changes: 2 additions & 1 deletion dockers/conda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-devel
# install versions
ARG PYTHON_VERSION=3.7
ARG PYTORCH_VERSION=1.4
ARG PYTORCH_CHANNEL=pytorch
ARG LIGHTNING_VERSION=""
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
ARG CONDA_VERSION=4.7.12
Expand Down Expand Up @@ -47,7 +48,7 @@ COPY --chown=flash environment.yml environment.yml
RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION && \
conda init bash && \
# conda install -y python=$PYTHON_VERSION && \
conda install pytorch=$PYTORCH_VERSION cudatoolkit=$CUDA_VERSION --channel=pytorch && \
conda install pytorch=$PYTORCH_VERSION cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \
conda env update --file environment.yml && \
rm environment.yml && \

Expand Down
2 changes: 1 addition & 1 deletion requirements/examples.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
torchvision>=0.4.0, <0.7
torchvision>=0.4.0
gym>=0.17.0

0 comments on commit 8be79a9

Please sign in to comment.