* Upgrade presets for PyTorch 2.4.0 (pull #1510)

* Enable distributed package using Gloo in presets for PyTorch
bytedeco · Sep 1, 2024 · dc8e6a5 · dc8e6a5
1 parent 3af42f8
commit dc8e6a5
Show file tree

Hide file tree

Showing 1,615 changed files with 15,128 additions and 8,446 deletions.
diff --git a/.github/workflows/pytorch.yml b/.github/workflows/pytorch.yml
@@ -33,7 +33,7 @@ jobs:
       - uses: bytedeco/javacpp-presets/.github/actions/deploy-ubuntu@actions
         timeout-minutes: 350
   macosx-arm64:
-    runs-on: macos-12
+    runs-on: macos-14
     steps:
       - uses: bytedeco/javacpp-presets/.github/actions/deploy-macosx@actions
   macosx-x86_64:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,5 @@
 
+ * Enable distributed package using Gloo in presets for PyTorch ([pull #1510](https://github.com/bytedeco/javacpp-presets/pull/1510))
  * Add presets for the CUPTI module of CUDA ([pull #1531](https://github.com/bytedeco/javacpp-presets/pull/1531))
  * Add new `ClangMemoryMgmtExample` in samples for LLVM ([pull #1522](https://github.com/bytedeco/javacpp-presets/pull/1522))
  * Enable `opencv_python3` module for `macosx-arm64` as well ([pull #1517](https://github.com/bytedeco/javacpp-presets/pull/1517))
@@ -8,7 +9,7 @@
  * Build FFmpeg with zimg to enable zscale filter ([pull #1481](https://github.com/bytedeco/javacpp-presets/pull/1481))
  * Enable PulseAudio support for FFmpeg on Linux ([pull #1472](https://github.com/bytedeco/javacpp-presets/pull/1472))
  * Virtualize `btCollisionWorld`, `btOverlapFilterCallback`, `btOverlapCallback` from Bullet Physics SDK ([pull #1475](https://github.com/bytedeco/javacpp-presets/pull/1475))
- * Upgrade presets for OpenCV 4.10.0, FFmpeg 7.0.2, Spinnaker 4.0.0.116 ([pull #1524](https://github.com/bytedeco/javacpp-presets/pull/1524)), DNNL 3.5.3, OpenBLAS 0.3.28, CMINPACK 1.3.9, GSL 2.8, CPython 3.12.5, NumPy 2.0.1, SciPy 1.14.0, LLVM 18.1.8, LibRaw 0.21.2 ([pull #1520](https://github.com/bytedeco/javacpp-presets/pull/1520)), Tesseract 5.4.1, libffi 3.4.6, CUDA 12.6.0, cuDNN 9.3.0, NCCL 2.22.3, nvCOMP 4.0.0, OpenCL 3.0.16, NVIDIA Video Codec SDK 12.2.72, PyTorch 2.3.0 ([pull #1466](https://github.com/bytedeco/javacpp-presets/pull/1466)), SentencePiece 0.2.0, TensorFlow Lite 2.17.0, TensorRT 10.3.0.26, Triton Inference Server 2.48.0, ONNX 1.16.2, ONNX Runtime 1.18.1, TVM 0.17.0, and their dependencies
+ * Upgrade presets for OpenCV 4.10.0, FFmpeg 7.0.2, Spinnaker 4.0.0.116 ([pull #1524](https://github.com/bytedeco/javacpp-presets/pull/1524)), DNNL 3.5.3, OpenBLAS 0.3.28, CMINPACK 1.3.9, GSL 2.8, CPython 3.12.5, NumPy 2.0.1, SciPy 1.14.0, LLVM 18.1.8, LibRaw 0.21.2 ([pull #1520](https://github.com/bytedeco/javacpp-presets/pull/1520)), Tesseract 5.4.1, libffi 3.4.6, CUDA 12.6.0, cuDNN 9.3.0, NCCL 2.22.3, nvCOMP 4.0.0, OpenCL 3.0.16, NVIDIA Video Codec SDK 12.2.72, PyTorch 2.4.0 ([pull #1466](https://github.com/bytedeco/javacpp-presets/pull/1466)), SentencePiece 0.2.0, TensorFlow Lite 2.17.0, TensorRT 10.3.0.26, Triton Inference Server 2.48.0, ONNX 1.16.2, ONNX Runtime 1.18.1, TVM 0.17.0, and their dependencies
 
 ### January 29, 2024 version 1.5.10
  * Introduce `macosx-arm64` builds for PyTorch ([pull #1463](https://github.com/bytedeco/javacpp-presets/pull/1463))

diff --git a/README.md b/README.md
@@ -223,7 +223,7 @@ Each child module in turn relies by default on the included [`cppbuild.sh` scrip
  * NVIDIA Video Codec SDK 12.2.x  https://developer.nvidia.com/nvidia-video-codec-sdk
  * OpenCL 3.0.x  https://github.com/KhronosGroup/OpenCL-ICD-Loader
  * MXNet 1.9.x  https://github.com/apache/incubator-mxnet
- * PyTorch 2.3.x  https://github.com/pytorch/pytorch
+ * PyTorch 2.4.x  https://github.com/pytorch/pytorch
  * SentencePiece 0.2.0  https://github.com/google/sentencepiece
  * TensorFlow 1.15.x  https://github.com/tensorflow/tensorflow
  * TensorFlow Lite 2.17.x  https://github.com/tensorflow/tensorflow

diff --git a/platform/pom.xml b/platform/pom.xml
@@ -292,7 +292,7 @@
     <dependency>
       <groupId>org.bytedeco</groupId>
       <artifactId>pytorch-platform</artifactId>
-      <version>2.3.0-${project.version}</version>
+      <version>2.4.0-${project.version}</version>
     </dependency>
     <dependency>
       <groupId>org.bytedeco</groupId>

diff --git a/pytorch/README.md b/pytorch/README.md
@@ -9,7 +9,7 @@ Introduction
 ------------
 This directory contains the JavaCPP Presets module for:
 
- * PyTorch 2.3.0  https://pytorch.org/
+ * PyTorch 2.4.0  https://pytorch.org/
 
 Please refer to the parent README.md file for more detailed information about the JavaCPP Presets.
 
@@ -48,14 +48,14 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic
         <dependency>
             <groupId>org.bytedeco</groupId>
             <artifactId>pytorch-platform</artifactId>
-            <version>2.3.0-1.5.11-SNAPSHOT</version>
+            <version>2.4.0-1.5.11-SNAPSHOT</version>
         </dependency>
 
         <!-- Additional dependencies required to use CUDA, cuDNN, and NCCL -->
         <dependency>
             <groupId>org.bytedeco</groupId>
             <artifactId>pytorch-platform-gpu</artifactId>
-            <version>2.3.0-1.5.11-SNAPSHOT</version>
+            <version>2.4.0-1.5.11-SNAPSHOT</version>
         </dependency>
 
         <!-- Additional dependencies to use bundled CUDA, cuDNN, and NCCL -->

diff --git a/pytorch/cppbuild.sh b/pytorch/cppbuild.sh
@@ -22,6 +22,9 @@ export USE_CUDNN=0
 export USE_NUMPY=0
 export USE_OPENMP=1
 export USE_SYSTEM_NCCL=1
+export USE_DISTRIBUTED=1
+export USE_NCCL=0 # Not supported on Windows
+
 if [[ "$EXTENSION" == *gpu ]]; then
     export USE_CUDA=1
     export USE_CUDNN=1
@@ -35,7 +38,7 @@ if [[ $PLATFORM == windows* ]]; then
     export PYTHON_BIN_PATH=$(which python.exe)
 fi
 
-PYTORCH_VERSION=2.3.0
+PYTORCH_VERSION=2.4.0
 
 export PYTORCH_BUILD_VERSION="$PYTORCH_VERSION"
 export PYTORCH_BUILD_NUMBER=1
@@ -44,6 +47,23 @@ mkdir -p "$PLATFORM$EXTENSION"
 cd "$PLATFORM$EXTENSION"
 INSTALL_PATH=`pwd`
 
+# Distributed needs libuv on Windows (on other platforms, it's included in tensorpipe)
+if [[ $PLATFORM == windows* ]]; then
+    if [[ ! -d libuv ]]; then
+        mkdir libuv
+        cd libuv
+        download https://dist.libuv.org/dist/v1.39.0/libuv-v1.39.0.tar.gz libuv.tgz
+        tar xfz libuv.tgz
+        mkdir build
+        cd build
+        cmake ../libuv-v1.39.0 -DBUILD_TESTING=OFF
+        cmake --build . --config Release
+        cmake --install . --config Release --prefix ../dist
+        cd ../..
+    fi
+    export libuv_ROOT=${INSTALL_PATH}/libuv/dist
+fi
+
 if [[ ! -d pytorch ]]; then
     git clone https://github.com/pytorch/pytorch
 fi
@@ -123,14 +143,16 @@ case $PLATFORM in
     macosx-arm64)
         export CC="clang"
         export CXX="clang++"
-        export CMAKE_OSX_ARCHITECTURES=arm64 # enable cross-compilation on a x86_64 host machine
+        # export PATH=$(brew --prefix llvm@18)/bin:$PATH # Use brew LLVM instead of Xcode LLVM 14
         export USE_MKLDNN=OFF
         export USE_QNNPACK=OFF # not compatible with arm64 as of PyTorch 2.1.2
         export CMAKE_OSX_DEPLOYMENT_TARGET=11.00 # minimum needed for arm64 support
         ;;
     macosx-x86_64)
         export CC="clang"
         export CXX="clang++"
+        export USE_MKLDNN=OFF
+        # export PATH=$(brew --prefix llvm@18)/bin:$PATH # Use brew LLVM instead of Xcode LLVM 14
         ;;
     windows-x86_64)
         if which ccache.exe; then
@@ -181,22 +203,53 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const nn::Module& modul
 ' torch/csrc/api/include/torch/nn/module.h
 sedinplace 's/char(\(.*\))/\1/g' torch/csrc/jit/serialization/pickler.h
 
+# Some Windows headers define a macro named "interface", so rename the parameter that uses this name
+sedinplace 's/const std::string& interface)/const std::string\& interface_name)/g' torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
+
+# Fix missing #includes (PyTorch 2.4.0)
+sedinplace 's/#include <stdexcept>/#include <stdexcept>\
+#include <vector>\
+#include <unordered_map>/'  torch/csrc/distributed/c10d/control_plane/Handlers.cpp
+
+# Remove PyTorch's adaptation of FindOpenMP.cmake and use the standard CMake module instead.
+# On Windows without iomp and with new versions of VS 2019, including -openmp:experimental and libomp causes
+# the final binary to be linked to both libomp and vcomp and to produce incorrect results.
+# Wait for an eventual upstream fix, or for CMake 3.30, which allows choosing between -openmp and -openmp:experimental,
+# and see if choosing experimental works. See Issue #1503.
+# On Linux, PyTorch's FindOpenMP.cmake picks LLVM libomp over libgomp. See Issue #1504.
+# On macOS, the standard CMake version works too.
+rm cmake/Modules/FindOpenMP.cmake
+sedinplace 's/include(${CMAKE_CURRENT_LIST_DIR}\/Modules\/FindOpenMP.cmake)/find_package(OpenMP)/g' cmake/Dependencies.cmake
+
 #USE_FBGEMM=0 USE_KINETO=0 USE_GLOO=0 USE_MKLDNN=0 \
 "$PYTHON_BIN_PATH" setup.py build
 
 rm -Rf ../lib
+if [[ ! -e torch/include/gloo ]]; then
+    ln -sf ../../third_party/gloo/gloo torch/include
+fi
 ln -sf pytorch/torch/include ../include
 ln -sf pytorch/torch/lib ../lib
 ln -sf pytorch/torch/bin ../bin
 
-# fix library with correct rpath on Mac
 case $PLATFORM in
     macosx-*)
-        cp /usr/local/lib/libomp.dylib ../lib/libiomp5.dylib
+        # Disguise libomp as libiomp5 (they share the same codebase and have the same symbols)
+        # This helps if user wants to link with MKL.
+        # On linux, user linking with mkl would need to set
+        # MKL_THREADING_LAYER=GNU
+        cp "$(brew ls libomp|grep libomp.dylib)" ../lib/libiomp5.dylib
         chmod +w ../lib/libiomp5.dylib
         install_name_tool -id @rpath/libiomp5.dylib ../lib/libiomp5.dylib
-        install_name_tool -change @rpath/libomp.dylib @rpath/libiomp5.dylib ../lib/libtorch_cpu.dylib
+        codesign --force -s - ../lib/libiomp5.dylib
+        old=$(otool -L ../lib/libtorch_cpu.dylib|grep libomp.dylib|awk '{print $1}')
+        echo install_name_tool -change $old @rpath/libiomp5.dylib ../lib/libtorch_cpu.dylib
+        install_name_tool -change $old @rpath/libiomp5.dylib ../lib/libtorch_cpu.dylib
+        codesign --force -s - ../lib/libtorch_cpu.dylib
         ;;
+    windows-*)
+        cp ../libuv/dist/lib/Release/* ../lib
+	;;
 esac
 
 cd ../..
diff --git a/pytorch/include_list.pl b/pytorch/include_list.pl
@@ -18,7 +18,7 @@ ($)
     for (my $d = @inc_per_depth - 1; $d >= $min_depth; $d--) {
         if ($inc_per_depth[$d]) {
             foreach my $i (@{$inc_per_depth[$d]}) {
-                print "#include \"$i\"\n";
+                print "#include \"$i\"\n" unless $incs{$i};
                 $incs{$i} = 1;
             }
             undef $inc_per_depth[$d];
@@ -27,12 +27,20 @@ ($)
 }
 
 sub go {
-    my $path = join ' ', @_;
+    my ($roots, $opts) = @_;
+    my $path = join ' ', @$roots, @$opts;
+
+    my $exe = "g++ -I. -I torch/csrc/api/include/ -DUSE_UCC -DUSE_C10D_GLOO -DUSE_C10D_MPI -DUSE_DISTRIBUTED -H $path -E 2>&1 > /dev/null";
+    #my $exe = "g++ -I. -I torch/csrc/api/include/ -DUSE_UCC -DUSE_C10D_GLOO -DUSE_C10D_MPI -DUSE_DISTRIBUTED -D_WIN32 -H $path -E 2>&1 > /dev/null";
+    my @inc = `$exe`;
+    if ($? != 0) {
+      print STDERR "Failed:\n$exe\nError: $?: $!\n";
+      exit $?;
+    }
 
-    my @inc = `g++ -I. -I torch/csrc/api/include/ -H $path -E 2>&1 > /dev/null`;
     foreach my $i (@inc) {
         chomp $i;
-        my ($depth, $f) = $i =~ /^(\.+)\s(.*\.h)$/;
+        my ($depth, $f) = $i =~ /^(\.+)\s(.*\.h(?:pp)?)$/;
         next unless $depth;
         $depth = length($depth);
         $f =~ s#^\./##;
@@ -48,18 +56,33 @@ sub go {
         push @$incs, $f;
     }
     flush(0);
+    foreach my $i (@$roots) {
+      print "#include \"$i\"\n" unless $incs{$i};
+      $incs{$i} = 1;
+    }
 }
 
 chdir "cppbuild/linux-x86_64-gpu/pytorch/torch/include";
 
-go('torch/csrc/api/include/torch/torch.h', 'torch/script.h', 'torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h');
+print <<EOF;
+// Included by
+// torch/csrc/api/include/torch/torch.h
+// torch/script.h
+// torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h
+// torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
+// torch/csrc/distributed/c10d/PrefixStore.hpp
+// torch/csrc/distributed/c10d/logger.hpp
+EOF
+
+go(['torch/csrc/api/include/torch/torch.h', 'torch/script.h', 'torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h', 'torch/csrc/distributed/c10d/ProcessGroupGloo.hpp', 'torch/csrc/distributed/c10d/PrefixStore.hpp', 'torch/csrc/distributed/c10d/logger.hpp'], []);
 
 print <<EOF;
 
 // Included by
-// ATen/cudnn/Descriptors.h
 // ATen/cudnn/Types.h
-// c10/cuda/CUDAGuard.h
+// ATen/cudnn/Descriptors.h
+// ATen/cuda/CUDAEvent.h
+// torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h
 EOF
 
-go('ATen/cudnn/Descriptors.h', 'ATen/cudnn/Types.h', 'c10/cuda/CUDAGuard.h', '-I/opt/cuda/targets/x86_64-linux/include/', 'torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h');
+go(['ATen/cudnn/Types.h', 'ATen/cudnn/Descriptors.h', 'ATen/cuda/CUDAEvent.h', 'torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h'], ['-I/opt/cuda/targets/x86_64-linux/include/', '-DUSE_CUDA']);
diff --git a/pytorch/platform/gpu/pom.xml b/pytorch/platform/gpu/pom.xml
@@ -12,7 +12,7 @@
 
   <groupId>org.bytedeco</groupId>
   <artifactId>pytorch-platform-gpu</artifactId>
-  <version>2.3.0-${project.parent.version}</version>
+  <version>2.4.0-${project.parent.version}</version>
   <name>JavaCPP Presets Platform GPU for PyTorch</name>
 
   <properties>

diff --git a/pytorch/platform/pom.xml b/pytorch/platform/pom.xml
@@ -12,7 +12,7 @@
 
   <groupId>org.bytedeco</groupId>
   <artifactId>pytorch-platform</artifactId>
-  <version>2.3.0-${project.parent.version}</version>
+  <version>2.4.0-${project.parent.version}</version>
   <name>JavaCPP Presets Platform for PyTorch</name>
 
   <properties>
@@ -41,6 +41,12 @@
       <version>${project.version}</version>
       <classifier>${javacpp.platform.linux-x86_64}</classifier>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>${javacpp.moduleId}</artifactId>
+      <version>${project.version}</version>
+      <classifier>${javacpp.platform.macosx-arm64}</classifier>
+    </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>${javacpp.moduleId}</artifactId>
@@ -65,7 +71,7 @@
             <configuration>
               <archive>
                 <manifestEntries>
-                  <Class-Path>${javacpp.moduleId}.jar ${javacpp.moduleId}-linux-x86_64.jar ${javacpp.moduleId}-macosx-x86_64.jar ${javacpp.moduleId}-windows-x86_64.jar</Class-Path>
+                  <Class-Path>${javacpp.moduleId}.jar ${javacpp.moduleId}-linux-x86_64.jar ${javacpp.moduleId}-macosx-arm64.jar ${javacpp.moduleId}-macosx-x86_64.jar ${javacpp.moduleId}-windows-x86_64.jar</Class-Path>
                 </manifestEntries>
               </archive>
             </configuration>
@@ -111,6 +117,7 @@
                   <moduleInfoSource>
                     module org.bytedeco.${javacpp.moduleId}.platform {
                       requires static org.bytedeco.${javacpp.moduleId}.linux.x86_64;
+                      requires static org.bytedeco.${javacpp.moduleId}.macosx.arm64;
                       requires static org.bytedeco.${javacpp.moduleId}.macosx.x86_64;
                       requires static org.bytedeco.${javacpp.moduleId}.windows.x86_64;
                     }

diff --git a/pytorch/pom.xml b/pytorch/pom.xml
@@ -11,7 +11,7 @@
 
   <groupId>org.bytedeco</groupId>
   <artifactId>pytorch</artifactId>
-  <version>2.3.0-${project.parent.version}</version>
+  <version>2.4.0-${project.parent.version}</version>
   <name>JavaCPP Presets for PyTorch</name>
 
   <dependencies>
@@ -24,6 +24,12 @@
       <artifactId>openblas</artifactId>
       <version>0.3.28-${project.parent.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.bytedeco</groupId>
+      <artifactId>cuda</artifactId>
+      <version>12.6-9.3-${project.parent.version}</version>
+      <optional>true</optional>
+    </dependency>
   </dependencies>
 
   <build>
@@ -43,6 +49,11 @@
             <artifactId>openblas-platform</artifactId>
             <version>0.3.28-${project.parent.version}</version>
           </dependency>
+          <dependency>
+            <groupId>org.bytedeco</groupId>
+            <artifactId>cuda-platform</artifactId>
+            <version>12.6-9.3-${project.parent.version}</version>
+          </dependency>
           <dependency>
             <groupId>org.bytedeco</groupId>
             <artifactId>numpy-platform</artifactId>
@@ -60,6 +71,7 @@
             <classPath>${basedir}/../openblas/target/classes/</classPath>
             <classPath>${basedir}/../cpython/target/classes/</classPath>
             <classPath>${basedir}/../numpy/target/classes/</classPath>
+            <classPath>${basedir}/../cuda/target/classes/</classPath>
             <classPath>${project.build.outputDirectory}</classPath>
           </classPaths>
           <includePaths>

diff --git a/pytorch/samples/pom.xml b/pytorch/samples/pom.xml
@@ -12,14 +12,14 @@
         <dependency>
             <groupId>org.bytedeco</groupId>
             <artifactId>pytorch-platform</artifactId>
-            <version>2.3.0-1.5.11-SNAPSHOT</version>
+            <version>2.4.0-1.5.11-SNAPSHOT</version>
         </dependency>
 
         <!-- Additional dependencies required to use CUDA, cuDNN, and NCCL -->
         <dependency>
             <groupId>org.bytedeco</groupId>
             <artifactId>pytorch-platform-gpu</artifactId>
-            <version>2.3.0-1.5.11-SNAPSHOT</version>
+            <version>2.4.0-1.5.11-SNAPSHOT</version>
         </dependency>
 
         <!-- Additional dependencies to use bundled CUDA, cuDNN, and NCCL -->

diff --git a/pytorch/src/gen/java/org/bytedeco/pytorch/AOTIModelContainerRunner.java b/pytorch/src/gen/java/org/bytedeco/pytorch/AOTIModelContainerRunner.java
@@ -4,7 +4,6 @@
 
 import org.bytedeco.pytorch.Allocator;
 import org.bytedeco.pytorch.Function;
-import org.bytedeco.pytorch.functions.*;
 import org.bytedeco.pytorch.Module;
 import org.bytedeco.javacpp.annotation.Cast;
 import java.nio.*;
@@ -14,6 +13,8 @@
 import static org.bytedeco.javacpp.presets.javacpp.*;
 import static org.bytedeco.openblas.global.openblas_nolapack.*;
 import static org.bytedeco.openblas.global.openblas.*;
+import org.bytedeco.javacpp.chrono.*;
+import static org.bytedeco.javacpp.global.chrono.*;
 
 import static org.bytedeco.pytorch.global.torch.*;
 
@@ -35,9 +36,9 @@ public class AOTIModelContainerRunner extends Pointer {
 
   public native @ByVal ExtraFilesMap getConstantNamesToOriginalFQNs();
   public native @ByVal StringIntMap getConstantNamesToDtypes();
-  public native void update_inactive_constant_buffer(@Cast("const torch::inductor::TensorConstantMap*") @ByRef HashAliasedIValueMap const_map);
+  public native void update_inactive_constant_buffer(@Cast("const torch::inductor::TensorConstantMap*") @ByRef SizeTStringMap const_map);
   public native void update_constant_buffer(
-        @Cast("const torch::inductor::TensorConstantMap*") @ByRef HashAliasedIValueMap const_map,
+        @Cast("const torch::inductor::TensorConstantMap*") @ByRef SizeTStringMap const_map,
         @Cast("bool") boolean use_inactive,
         @Cast("bool") boolean validate_full_updates);
   public native void run_const_fold(