NVIDIA · azrael417 · Sep 5, 2024 · Aug 30, 2024 · Aug 30, 2024 · Aug 30, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -14,6 +14,7 @@ set(TORCHFORT_YAML_CPP_ROOT CACHE STRING "Path to search for yaml-cpp installati
 option(TORCHFORT_BUILD_FORTRAN "Build Fortran bindings" ON)
 option(TORCHFORT_BUILD_EXAMPLES "Build examples" OFF)
 option(TORCHFORT_BUILD_TESTS "Build tests" OFF)
+option(TORCHFORT_ENABLE_GPU "Enable GPU/CUDA support" ON)
 
 # For backward-compatibility with existing variable
 if (YAML_CPP_ROOT)
@@ -54,51 +55,53 @@ endif()
 find_package(MPI REQUIRED)
 
 # CUDA
-find_package(CUDAToolkit REQUIRED)
-
-# HPC SDK
-# Locate and append NVHPC CMake configuration if available
-find_program(NVHPC_CXX_BIN "nvc++")
-if (NVHPC_CXX_BIN)
- string(REPLACE "compilers/bin/nvc++" "cmake" NVHPC_CMAKE_DIR ${NVHPC_CXX_BIN})
- set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH};${NVHPC_CMAKE_DIR}")
- find_package(NVHPC COMPONENTS "")
-endif()
-
-# Get NCCL library (with optional override)
-if (TORCHFORT_NCCL_ROOT)
- find_path(NCCL_INCLUDE_DIR REQUIRED
- NAMES nccl.h
- HINTS ${TORCHFORT_NCCL_ROOT}/include
- )
-
- find_library(NCCL_LIBRARY REQUIRED
- NAMES nccl
- HINTS ${TORCHFORT_NCCL_ROOT}/lib
- )
-else()
- if (NVHPC_FOUND)
- find_package(NVHPC REQUIRED COMPONENTS NCCL)
- find_library(NCCL_LIBRARY
+if (TORCHFORT_ENABLE_GPU)
+ find_package(CUDAToolkit REQUIRED)
+
+ # HPC SDK
+ # Locate and append NVHPC CMake configuration if available
+ find_program(NVHPC_CXX_BIN "nvc++")
+ if (NVHPC_CXX_BIN)
+ string(REPLACE "compilers/bin/nvc++" "cmake" NVHPC_CMAKE_DIR ${NVHPC_CXX_BIN})
+ set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH};${NVHPC_CMAKE_DIR}")
+ find_package(NVHPC COMPONENTS "")
+ endif()
+
+ # Get NCCL library (with optional override)
+ if (TORCHFORT_NCCL_ROOT)
+ find_path(NCCL_INCLUDE_DIR REQUIRED
+ NAMES nccl.h
+ HINTS ${TORCHFORT_NCCL_ROOT}/include
+ )
+
+ find_library(NCCL_LIBRARY REQUIRED
  NAMES nccl
- HINTS ${NVHPC_NCCL_LIBRARY_DIR}
+ HINTS ${TORCHFORT_NCCL_ROOT}/lib
  )
- string(REPLACE "/lib" "/include" NCCL_INCLUDE_DIR ${NVHPC_NCCL_LIBRARY_DIR})
  else()
- message(FATAL_ERROR "Cannot find NCCL library. Please set TORCHFORT_NCCL_ROOT to NCCL installation directory.")
+ if (NVHPC_FOUND)
+ find_package(NVHPC REQUIRED COMPONENTS NCCL)
+ find_library(NCCL_LIBRARY
+ NAMES nccl
+ HINTS ${NVHPC_NCCL_LIBRARY_DIR}
+ )
+ string(REPLACE "/lib" "/include" NCCL_INCLUDE_DIR ${NVHPC_NCCL_LIBRARY_DIR})
+ else()
+ message(FATAL_ERROR "Cannot find NCCL library. Please set TORCHFORT_NCCL_ROOT to NCCL installation directory.")
+ endif()
  endif()
+
+ message(STATUS "Using NCCL library: ${NCCL_LIBRARY}")
+
+ # PyTorch
+ # Set TORCH_CUDA_ARCH_LIST string to match TORCHFORT_CUDA_CC_LIST
+ foreach(CUDA_CC ${TORCHFORT_CUDA_CC_LIST})
+ string(REGEX REPLACE "([0-9])$" ".\\1" CUDA_CC_W_DOT ${CUDA_CC})
+ list(APPEND TORCH_CUDA_ARCH_LIST ${CUDA_CC_W_DOT})
+ endforeach()
+ list(JOIN TORCH_CUDA_ARCH_LIST " " TORCH_CUDA_ARCH_LIST)
 endif()
 
-message(STATUS "Using NCCL library: ${NCCL_LIBRARY}")
-
-# PyTorch
-# Set TORCH_CUDA_ARCH_LIST string to match TORCHFORT_CUDA_CC_LIST
-foreach(CUDA_CC ${TORCHFORT_CUDA_CC_LIST})
- string(REGEX REPLACE "([0-9])$" ".\\1" CUDA_CC_W_DOT ${CUDA_CC})
- list(APPEND TORCH_CUDA_ARCH_LIST ${CUDA_CC_W_DOT})
-endforeach()
-list(JOIN TORCH_CUDA_ARCH_LIST " " TORCH_CUDA_ARCH_LIST)
-
 find_package(Torch REQUIRED)
 
 # yaml-cpp
@@ -160,16 +163,23 @@ target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
-target_link_libraries(${PROJECT_NAME} PRIVATE ${NCCL_LIBRARY})
 target_link_libraries(${PROJECT_NAME} PRIVATE MPI::MPI_CXX)
 target_link_libraries(${PROJECT_NAME} PRIVATE ${YAML_CPP_LIBRARY})
-target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
 
 target_include_directories(${PROJECT_NAME}
  PRIVATE
  ${YAML_CPP_INCLUDE_DIR}
  ${MPI_CXX_INCLUDE_DIRS}
  ${TORCH_INCLUDE_DIRS}
+)
+# GPU-only usage requirements; NCCL is linked here so a cached NCCL_LIBRARY is ignored when GPU support is off
+if (TORCHFORT_ENABLE_GPU)
+ target_include_directories(${PROJECT_NAME}
+ PRIVATE
  ${CUDAToolkit_INCLUDE_DIRS}
  ${NCCL_INCLUDE_DIR}
-)
+ )
+ target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
+ target_link_libraries(${PROJECT_NAME} PRIVATE ${NCCL_LIBRARY})
+ target_compile_definitions(${PROJECT_NAME} PRIVATE ENABLE_GPU)
+endif()
 target_compile_definitions(${PROJECT_NAME} PRIVATE YAML_CPP_STATIC_DEFINE)
 target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${TORCH_CXX_FLAGS}>)
 

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -24,15 +24,15 @@ ENV CUDA_HOME /opt/nvidia/hpc_sdk/Linux_x86_64/24.1/cuda
 
 RUN echo "source /opt/nvidia/hpc_sdk/Linux_x86_64/24.1/comm_libs/12.3/hpcx/latest/hpcx-init.sh; hpcx_load" >> /root/.bashrc
 
-# Install NCCL 2.20.3 for compatibility with PyTorch 2.2.1
+# Install NCCL v2.20.3-1 for compatibility with PyTorch 2.2.1+
 RUN cd /opt && \
  git clone --branch v2.20.3-1 https://github.com/NVIDIA/nccl.git && \
  cd nccl && \
  make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90" CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/24.1/cuda
 ENV LD_LIBRARY_PATH /opt/nccl/build/lib:$LD_LIBRARY_PATH
 
 # Install PyTorch
-RUN pip3 install torch==2.2.1 torchvision torchaudio
+RUN pip3 install torch==2.4.0
 
 # Install yaml-cpp
 RUN git clone https://github.com/jbeder/yaml-cpp.git --branch 0.8.0 && \
@@ -74,7 +74,7 @@ RUN cd /torchfort && mkdir build && cd build && \
  -DTORCHFORT_BUILD_TESTS=1 \
  -DCMAKE_PREFIX_PATH="`python -c 'import torch;print(torch.utils.cmake_prefix_path)'`" \
  .. && \
- make VERBOSE=1 -j$(nproc) install && \
+ make -j$(nproc) install && \
  cd / && rm -rf torchfort
 ENV LD_LIBRARY_PATH /opt/torchfort/lib:${LD_LIBRARY_PATH}
 ENV LD_LIBRARY_PATH /usr/local/lib/python3.10/dist-packages/torch/lib:${LD_LIBRARY_PATH}

diff --git a/docker/Dockerfile_gnu b/docker/Dockerfile_gnu
@@ -1,8 +1,12 @@
 FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu22.04
 
 # Install System Dependencies
+ENV DEBIAN_FRONTEND noninteractive
 RUN apt update -y && \
- DEBIAN_FRONTEND=noninteractive apt install -y curl unzip wget cmake python3 python-is-python3 python3-pip python3-pybind11 git vim gfortran doxygen libibverbs-dev
+ apt install -y curl unzip wget cmake && \
+ apt install -y python3 python-is-python3 python3-pip python3-pybind11 && \
+ apt install -y git vim gfortran doxygen && \
+ apt install -y libibverbs-dev ibverbs-utils numactl
 
 # Download HPCX and compile with Fortran support
 RUN cd /opt && \
@@ -32,15 +36,15 @@ ENV LD_LIBRARY_PATH /opt/hpcx/ompi/lib:$LD_LIBRARY_PATH
 
 RUN echo "source /opt/hpcx/hpcx-init.sh; hpcx_load" >> /root/.bashrc
 
-# Install NCCL 2.20.3 for compatibility with PyTorch 2.2.1
+# Install NCCL v2.20.3-1 for compatibility with PyTorch 2.2.1+
 RUN cd /opt && \
  git clone --branch v2.20.3-1 https://github.com/NVIDIA/nccl.git && \
  cd nccl && \
  make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90"
 ENV LD_LIBRARY_PATH /opt/nccl/build/lib:$LD_LIBRARY_PATH
 
 # Install PyTorch
-RUN pip3 install torch==2.2.1 torchvision torchaudio
+RUN pip3 install torch==2.4.0
 
 # Install yaml-cpp
 RUN git clone https://github.com/jbeder/yaml-cpp.git --branch 0.8.0 && \
@@ -78,6 +82,7 @@ RUN cd /torchfort && mkdir build && cd build && \
  -DTORCHFORT_YAML_CPP_ROOT=/opt/yaml-cpp \
  -DTORCHFORT_NCCL_ROOT=/opt/nccl/build \
  -DTORCHFORT_BUILD_EXAMPLES=1 \
+ -DTORCHFORT_BUILD_TESTS=1 \
  -DCMAKE_PREFIX_PATH="`python -c 'import torch;print(torch.utils.cmake_prefix_path)'`" \
  .. && \
  make -j$(nproc) install && \

diff --git a/docker/Dockerfile_gnu_cpuonly b/docker/Dockerfile_gnu_cpuonly
@@ -0,0 +1,76 @@
+FROM ubuntu:22.04
+
+# Install system dependencies without interactive prompts (apt-get is the script-stable apt CLI)
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update -y && \
+ apt-get install -y build-essential && \
+ apt-get install -y curl unzip wget cmake && \
+ apt-get install -y python3 python-is-python3 python3-pip python3-pybind11 && \
+ apt-get install -y git vim gfortran doxygen && \
+ apt-get install -y libibverbs-dev ibverbs-utils numactl
+
+# Download OpenMPI 5.0.5 and compile with Fortran support
+RUN cd /opt && \
+ wget https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.5.tar.gz && \
+ tar xzf openmpi-5.0.5.tar.gz && \
+ cd openmpi-5.0.5 && \
+ FC=gfortran CC=gcc CXX=g++ ./configure --prefix=/opt/openmpi \
+ --with-libevent=internal \
+ --enable-mpi1-compatibility \
+ --without-xpmem \
+ --with-slurm && \
+ make -j$(nproc) install && \
+ cd /opt && rm -rf openmpi-5.0.5 && rm openmpi-5.0.5.tar.gz
+
+ENV PATH=/opt/openmpi/bin:$PATH
+# Append any previous LD_LIBRARY_PATH only if it is set, so an unset value leaves no trailing ':' (empty entry)
+ENV LD_LIBRARY_PATH=/opt/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+# Install PyTorch (CPU-only wheel index)
+RUN pip3 install torch==2.4.0 --index-url https://download.pytorch.org/whl/cpu
+
+# Install yaml-cpp as a static, position-independent library
+RUN git clone https://github.com/jbeder/yaml-cpp.git --branch 0.8.0 && \
+ cd yaml-cpp && \
+ mkdir build && cd build && \
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/yaml-cpp \
+ -DCMAKE_CXX_FLAGS:="-D_GLIBCXX_USE_CXX11_ABI=0" \
+ -DBUILD_SHARED_LIBS=OFF \
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && \
+ make -j$(nproc) && make install
+ENV LD_LIBRARY_PATH=/opt/yaml-cpp/lib:${LD_LIBRARY_PATH}
+
+# Install HDF5 with parallel (MPI) and Fortran support
+RUN wget https://github.com/HDFGroup/hdf5/archive/refs/tags/hdf5-1_14_3.tar.gz && \
+ tar xzf hdf5-1_14_3.tar.gz && \
+ cd hdf5-hdf5-1_14_3 && \
+ CC=mpicc FC=mpifort \
+ ./configure --enable-parallel \
+ --enable-fortran \
+ --prefix=/opt/hdf5 && \
+ make -j$(nproc) install && \
+ cd .. && \
+ rm -rf hdf5-hdf5-1_14_3 hdf5-1_14_3.tar.gz
+ENV LD_LIBRARY_PATH=/opt/hdf5/lib:$LD_LIBRARY_PATH
+
+# Install additional Python dependencies
+RUN pip3 install wandb ruamel-yaml h5py matplotlib pygame moviepy
+
+# Install TorchFort without GPU support
+ENV FC=gfortran
+ENV HDF5_ROOT=/opt/hdf5
+COPY . /torchfort
+RUN cd /torchfort && mkdir build && cd build && \
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/torchfort \
+ -DTORCHFORT_YAML_CPP_ROOT=/opt/yaml-cpp \
+ -DTORCHFORT_ENABLE_GPU=0 \
+ -DTORCHFORT_BUILD_EXAMPLES=1 \
+ -DTORCHFORT_BUILD_TESTS=1 \
+ -DCMAKE_PREFIX_PATH="`python -c 'import torch;print(torch.utils.cmake_prefix_path)'`" \
+ .. && \
+ make -j$(nproc) install && \
+ cd / && rm -rf torchfort
+ENV LD_LIBRARY_PATH=/opt/torchfort/lib:${LD_LIBRARY_PATH}
+ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/torch/lib:${LD_LIBRARY_PATH}
+
+ENTRYPOINT bash
diff --git a/examples/cpp/cart_pole/CMakeLists.txt b/examples/cpp/cart_pole/CMakeLists.txt
@@ -35,9 +35,15 @@ foreach(tgt ${cart_pole_example_targets})
  target_link_libraries(${tgt} PRIVATE MPI::MPI_CXX)
  target_link_libraries(${tgt} PRIVATE ${YAML_CPP_LIBRARY})
  target_link_libraries(${tgt} PRIVATE environments)
- target_link_libraries(${tgt} PRIVATE CUDA::cudart)
  target_compile_options(${tgt} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${TORCH_CXX_FLAGS}>)
  target_link_options(${tgt} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${TORCH_CXX_FLAGS}>)
+ if (TORCHFORT_ENABLE_GPU)
+ target_include_directories(${tgt}
+ PRIVATE
+ ${CUDAToolkit_INCLUDE_DIRS}
+ )
+ target_link_libraries(${tgt} PRIVATE CUDA::cudart)
+ endif()
 endforeach()
 
 # installation

diff --git a/examples/cpp/cart_pole/config.yaml b/examples/cpp/cart_pole/config.yaml
@@ -14,7 +14,7 @@ algorithm:
  gamma: 0.99
  rho: 0.99
 
-action:
+actor:
  type: space_noise
  parameters:
  a_low: -1.0

diff --git a/examples/cpp/cart_pole/python/initialize_models.py b/examples/cpp/cart_pole/python/initialize_models.py
@@ -40,10 +40,13 @@ def main(args):
 
  # set seed
  torch.manual_seed(666)
- torch.cuda.manual_seed(666)
-
- # script model:
- device = torch.device("cuda:0")
+
+ # CUDA check
+ if torch.cuda.is_available():
+ torch.cuda.manual_seed(666)
+ device = torch.device("cuda:0")
+ else:
+ device = torch.device("cpu")
 
  # parameters
  batch_size = 64

diff --git a/examples/cpp/cart_pole/python/visualize.py b/examples/cpp/cart_pole/python/visualize.py
@@ -130,11 +130,14 @@ def main(args):
 
  # set seed
  torch.manual_seed(666)
- torch.cuda.manual_seed(666)
-
- # script model:
- device = torch.device("cuda:0")
 
+ # CUDA check
+ if torch.cuda.is_available():
+ torch.cuda.manual_seed(666)
+ device = torch.device("cuda:0")
+ else:
+ device = torch.device("cpu")
+
  # parameters
  batch_size = 1