Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bump to Cuda 11.1; Implement fix for #157 #174

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Handy commands:
# - `make docker-build`: builds DOCKERIMAGE (default: `packnet-sfm:latest`)
PROJECT ?= packnet-sfm
PROJECT ?= stellarpower/packnet-sfm
WORKSPACE ?= /workspace/$(PROJECT)
DOCKER_IMAGE ?= ${PROJECT}:latest

Expand Down Expand Up @@ -79,4 +79,4 @@ docker-run: docker-build

docker-run-mpi: docker-build
nvidia-docker run ${DOCKER_OPTS} ${DOCKER_IMAGE} \
bash -c "${MPI_CMD} ${COMMAND}"
bash -c "${MPI_CMD} ${COMMAND}"
62 changes: 52 additions & 10 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,48 @@
# Copyright 2020 Toyota Research Institute. All rights reserved.

FROM nvidia/cuda:10.2-devel-ubuntu18.04
# CUDA_VERSION is already defined by the base image, so our own build args carry a PSFM_ prefix to avoid clashing with it
ARG PSFM_CUDA_MAJOR_VERSION=11
ARG PSFM_CUDA_MINOR_VERSION=1
ARG PSFM_CUDA_VERSION=${PSFM_CUDA_MAJOR_VERSION}.${PSFM_CUDA_MINOR_VERSION}

ARG SOURCE_IMAGE=nvidia/cuda:${PSFM_CUDA_VERSION}-devel-ubuntu18.04
########################################
FROM $SOURCE_IMAGE as base
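# Re-declare the build args from above: an ARG defined before FROM is out of
# scope inside the build stage until it is declared again (without a value).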
ARG PSFM_CUDA_MAJOR_VERSION
ARG PSFM_CUDA_MINOR_VERSION
ARG PSFM_CUDA_VERSION
ARG SOURCE_IMAGE

ENV PROJECT=packnet-sfm
ENV PYTORCH_VERSION=1.8.1
ENV TORCHVISION_VERSION=0.9.1
ENV CUDNN_VERSION=7.6.5.32-1+cuda10.2
ENV NCCL_VERSION=2.7.8-1+cuda10.2

# https://lambdalabs.com/blog/install-tensorflow-and-pytorch-on-rtx-30-series/
# 3090 => CUDA 11.1 OK
# => cuDNN 8.2.1

# From https://github.com/pytorch/vision:
# Torch 1.9.0 <=> Torchvision 0.10.0

# From https://pytorch.org/get-started/locally/
# Torch 1.9.0 => CUDA 11.1 || CUDA 10.2

#ENV CUDNN_VERSION=7.6.5.32-1+cuda10.2
#ENV NCCL_VERSION=2.7.8-1+cuda10.2

ENV CUDNN_VERSION=8.0.5.39-1+cuda${PSFM_CUDA_VERSION}
# Use the variable as a sanity check that we're using the right CUDA
ARG CUDNN_PACKAGE=libcudnn8=${CUDNN_VERSION}
ENV NCCL_VERSION=2.7.8-1+cuda${PSFM_CUDA_VERSION}
ARG NCCL_PACKAGE=libnccl2=${NCCL_VERSION}

ENV PYTORCH_VERSION=1.9.0
ENV TORCHVISION_VERSION=0.10.0


RUN echo "Using " ${SOURCE_IMAGE} " with CUDA version " ${PSFM_CUDA_VERSION} \
" and Pytorch/torchvision " ${PYTORCH_VERSION}/${TORCHVISION_VERSION} >&2


ENV HOROVOD_VERSION=65de4c961d1e5ad2828f2f6c4329072834f27661
ENV TRT_VERSION=6.0.1.5
ENV LC_ALL=C.UTF-8
Expand All @@ -29,8 +65,8 @@ RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-
vim \
wget \
ca-certificates \
libcudnn7=${CUDNN_VERSION} \
libnccl2=${NCCL_VERSION} \
${CUDNN_PACKAGE} \
${NCCL_PACKAGE} \
libnccl-dev=${NCCL_VERSION} \
libjpeg-dev \
libpng-dev \
Expand Down Expand Up @@ -85,8 +121,14 @@ RUN pip install future typing numpy pandas matplotlib jupyter h5py \
mpi4py onnx onnxruntime pycuda yacs cython==0.29.10

# Install PyTorch
RUN pip install torch==${PYTORCH_VERSION} \
torchvision==${TORCHVISION_VERSION} && ldconfig
#RUN pip install torch==${PYTORCH_VERSION} \
# torchvision==${TORCHVISION_VERSION} && ldconfig
RUN pip3 install \
torch==${PYTORCH_VERSION}+cu${PSFM_CUDA_MAJOR_VERSION}${PSFM_CUDA_MINOR_VERSION} \
torchvision==${TORCHVISION_VERSION}+cu${PSFM_CUDA_MAJOR_VERSION}${PSFM_CUDA_MINOR_VERSION} \
-f https://download.pytorch.org/whl/torch_stable.html \
&& ldconfig


# Install apex
RUN mkdir /workspace
Expand Down Expand Up @@ -141,4 +183,4 @@ WORKDIR /workspace/${PROJECT}
# Copy project source last (to avoid cache busting)
WORKDIR /workspace/${PROJECT}
COPY . /workspace/${PROJECT}
ENV PYTHONPATH="/workspace/${PROJECT}:$PYTHONPATH"
ENV PYTHONPATH="/workspace/${PROJECT}:$PYTHONPATH"
4 changes: 2 additions & 2 deletions packnet_sfm/models/model_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,14 +292,14 @@ def evaluate_depth(self, batch):
"""Evaluate batch to produce depth metrics."""
# Get predicted depth
inv_depths = self.model(batch)['inv_depths']
depth = inv2depth(inv_depths[0])
depth = inv2depth(inv_depths)
# Post-process predicted depth
batch['rgb'] = flip_lr(batch['rgb'])
if 'input_depth' in batch:
batch['input_depth'] = flip_lr(batch['input_depth'])
inv_depths_flipped = self.model(batch)['inv_depths']
inv_depth_pp = post_process_inv_depth(
inv_depths[0], inv_depths_flipped[0], method='mean')
inv_depths, inv_depths_flipped, method='mean')
depth_pp = inv2depth(inv_depth_pp)
batch['rgb'] = flip_lr(batch['rgb'])
# Calculate predicted metrics
Expand Down