From b75db303fa176cbc02bf7d194d4e458624a3fd9b Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Tue, 6 Jun 2023 16:47:19 -0700 Subject: [PATCH 01/14] Try patchelf to link to add hpcx to path --- CMakeLists.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 443a70a..db95835 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -493,6 +493,19 @@ else() ENDFOREACH(plib) endif() # TRITON_PYTORCH_DOCKER_BUILD + +# Libtorch.so has dependencies in the hpcx folder that must be in the rpath. +install( + CODE + "EXECUTE_PROCESS( + COMMAND patchelf --set-rpath $ORIGIN:/opt/hpcx/ucx/lib/ libtorch.so + RESULT_VARIABLE PATCHELF_STATUS + WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) + if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) + message(FATAL_ERROR \"FAILED: to run patchelf\") + endif()" + ) + install( EXPORT triton-pytorch-backend-targets From 2366d5bbf380288499eda9c4437d7a3cbce15609 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Wed, 7 Jun 2023 09:43:16 -0700 Subject: [PATCH 02/14] Update dependencies after rpath patched --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index db95835..8b3ca39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -498,7 +498,7 @@ endif() # TRITON_PYTORCH_DOCKER_BUILD install( CODE "EXECUTE_PROCESS( - COMMAND patchelf --set-rpath $ORIGIN:/opt/hpcx/ucx/lib/ libtorch.so + COMMAND patchelf --set-rpath $ORIGIN:/opt/hpcx/ucx/lib/ libtorch.so && /sbin/ldconfig RESULT_VARIABLE PATCHELF_STATUS WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) From b323a479379e8f24810fbf7b825c771ce725f93c Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Wed, 7 Jun 2023 10:15:48 -0700 Subject: [PATCH 03/14] Separate ldconfig, use add rpath instead --- CMakeLists.txt | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b3ca39..5a81ee5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -496,15 +496,19 @@ endif() # TRITON_PYTORCH_DOCKER_BUILD # Libtorch.so has dependencies in the hpcx folder that must be in the rpath. install( - CODE - "EXECUTE_PROCESS( - COMMAND patchelf --set-rpath $ORIGIN:/opt/hpcx/ucx/lib/ libtorch.so && /sbin/ldconfig - RESULT_VARIABLE PATCHELF_STATUS - WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) - if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) - message(FATAL_ERROR \"FAILED: to run patchelf\") - endif()" - ) + CODE + "execute_process( + COMMAND patchelf --add-rpath /opt/hpcx/ucx/lib/ libtorch.so + RESULT_VARIABLE PATCHELF_STATUS + WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) + if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) + message(FATAL_ERROR \"FAILED: to run patchelf\") + endif() + + execute_process( + COMMAND /sbin/ldconfig + )" +) install( EXPORT From 5fb75196d521c4d750779da605ff3b02db0ffd01 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Wed, 7 Jun 2023 13:55:21 -0700 Subject: [PATCH 04/14] Add message if PyTorch ldconfig fails --- CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a81ee5..977bfa6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -507,7 +507,11 @@ install( execute_process( COMMAND /sbin/ldconfig - )" + RESULT_VARIABLE LDCONFIG_STATUS + WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) + if(LDCONFIG_STATUS AND NOT LDCONFIG_STATUS EQUAL 0) + message(FATAL_ERROR \"FAILED: to run ldconfig\") + endif()" ) install( From 5a33af074fc8c94bcf0a75cccb028fef919845a6 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Wed, 7 Jun 2023 21:14:51 -0700 Subject: [PATCH 05/14] Remove patchelf --- CMakeLists.txt | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 977bfa6..a3b1e15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,7 +204,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) include/torch include/torchvision COMMAND ${CMAKE_COMMAND} -E make_directory "include/torchvision" - COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} + # COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." || true COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND /bin/sh -c "for i in ${LIBTORCH_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:/usr/local/lib/$i $i ; done" @@ -494,26 +494,6 @@ else() endif() # TRITON_PYTORCH_DOCKER_BUILD -# Libtorch.so has dependencies in the hpcx folder that must be in the rpath. -install( - CODE - "execute_process( - COMMAND patchelf --add-rpath /opt/hpcx/ucx/lib/ libtorch.so - RESULT_VARIABLE PATCHELF_STATUS - WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) - if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) - message(FATAL_ERROR \"FAILED: to run patchelf\") - endif() - - execute_process( - COMMAND /sbin/ldconfig - RESULT_VARIABLE LDCONFIG_STATUS - WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) - if(LDCONFIG_STATUS AND NOT LDCONFIG_STATUS EQUAL 0) - message(FATAL_ERROR \"FAILED: to run ldconfig\") - endif()" -) - install( EXPORT triton-pytorch-backend-targets From 1895b09b993820e63742efc9cb03c5e80efa2f41 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Wed, 7 Jun 2023 21:57:01 -0700 Subject: [PATCH 06/14] Copy needed libs to Torch folder --- CMakeLists.txt | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index a3b1e15..8da131b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -233,6 +233,11 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_calib3d.so libopencv_calib3d.so COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_features2d.so libopencv_features2d.so COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_flann.so libopencv_flann.so + COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucc/lib/libucc.so.1 libucc.so.1 + COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucm.so.0 libucm.so.0 + COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucp.so.0 libucp.so.0 + COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucs.so.0 libucs.so.0 + COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libuct.so.0 libuct.so.0 COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so.16.37.0 libpng16.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libjpeg.so.8.2.2 libjpeg.so COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_def.so.1; fi" @@ -494,6 +499,26 @@ else() endif() # TRITON_PYTORCH_DOCKER_BUILD +# Libtorch.so has dependencies in the hpcx folder that must be in the rpath. +install( + CODE + "execute_process( + COMMAND patchelf --add-rpath /opt/hpcx/ucx/lib/ libtorch.so + RESULT_VARIABLE PATCHELF_STATUS + WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) + if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) + message(FATAL_ERROR \"FAILED: to run patchelf\") + endif() + + execute_process( + COMMAND /sbin/ldconfig + RESULT_VARIABLE LDCONFIG_STATUS + WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) + if(LDCONFIG_STATUS AND NOT LDCONFIG_STATUS EQUAL 0) + message(FATAL_ERROR \"FAILED: to run ldconfig\") + endif()" +) + install( EXPORT triton-pytorch-backend-targets From e9ce30f320fe3a84e2f13e0a46def0bc3263c483 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Wed, 7 Jun 2023 22:35:01 -0700 Subject: [PATCH 07/14] Change build to copy hpcx libs to install dir --- CMakeLists.txt | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8da131b..a104e2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,6 +139,11 @@ set(PT_LIBS "libtorch_cuda_linalg.so" "libtorch_global_deps.so" "libnvfuser_codegen.so" + "libucc.so.1" + "libucm.so.0" + "libucp.so.0" + "libucs.so.0" + "libuct.so.0" ) if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) @@ -254,7 +259,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM ) - add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) +add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) add_library(ptlib SHARED IMPORTED GLOBAL) add_dependencies(ptlib ptlib_target) @@ -498,27 +503,6 @@ else() ENDFOREACH(plib) endif() # TRITON_PYTORCH_DOCKER_BUILD - -# Libtorch.so has dependencies in the hpcx folder that must be in the rpath. -install( - CODE - "execute_process( - COMMAND patchelf --add-rpath /opt/hpcx/ucx/lib/ libtorch.so - RESULT_VARIABLE PATCHELF_STATUS - WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) - if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) - message(FATAL_ERROR \"FAILED: to run patchelf\") - endif() - - execute_process( - COMMAND /sbin/ldconfig - RESULT_VARIABLE LDCONFIG_STATUS - WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) - if(LDCONFIG_STATUS AND NOT LDCONFIG_STATUS EQUAL 0) - message(FATAL_ERROR \"FAILED: to run ldconfig\") - endif()" -) - install( EXPORT triton-pytorch-backend-targets From 79c0b13fb7eb3ececaa624a5965001955958d447 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 8 Jun 2023 12:24:46 -0700 Subject: [PATCH 08/14] Only copy HPCX libs when CPU-only --- CMakeLists.txt | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a104e2f..bff4de0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,11 +139,6 @@ set(PT_LIBS "libtorch_cuda_linalg.so" "libtorch_global_deps.so" "libnvfuser_codegen.so" - "libucc.so.1" - "libucm.so.0" - "libucp.so.0" - "libucs.so.0" - "libuct.so.0" ) if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) @@ -194,6 +189,34 @@ set(OPENCV_LIBS "libjpeg.so" ) +# In CPU-only mode, get the hpcx libraries neededby libtorch.so. +if(NOT TRITON_ENABLE_GPU) + set(HPCX_LIBS + "libucc.so.1" + "libucm.so.0" + "libucp.so.0" + "libucs.so.0" + "libuct.so.0" + ) + add_custom_command( + OUTPUT + ${HPCX_LIBS} + # COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} + COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." || true + COMMAND echo "Running ${TRITON_PYTORCH_DOCKER_IMAGE} to extract HPCX libraries" + COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} + COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucc/lib/libucc.so.1 libucc.so.1 + COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucm.so.0 libucm.so.0 + COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucp.so.0 libucp.so.0 + COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucs.so.0 libucs.so.0 + COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libuct.so.0 libuct.so.0 + COMMAND ls -la . + COMMAND docker rm pytorch_backend_ptlib + COMMENT "Extracting HPCX libraries from ${TRITON_PYTORCH_DOCKER_IMAGE}" + VERBATIM + ) +endif() # TRITON_ENABLE_GPU + # The patchelf commands ensure the MKL libraries are loaded correctly during runtime # Without these, the framework/backend complains of missing libraries / symbols and # in some cases leads to segmentation faults. @@ -238,11 +261,6 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_calib3d.so libopencv_calib3d.so COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_features2d.so libopencv_features2d.so COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_flann.so libopencv_flann.so - COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucc/lib/libucc.so.1 libucc.so.1 - COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucm.so.0 libucm.so.0 - COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucp.so.0 libucp.so.0 - COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucs.so.0 libucs.so.0 - COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libuct.so.0 libuct.so.0 COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so.16.37.0 libpng16.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libjpeg.so.8.2.2 libjpeg.so COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_def.so.1; fi" @@ -259,7 +277,8 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM ) -add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) + message(STATUS "hpcx libs: ${HPCX_LIBS}") + add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${HPCX_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) add_library(ptlib SHARED IMPORTED GLOBAL) add_dependencies(ptlib ptlib_target) @@ -425,7 +444,7 @@ install( if (${TRITON_PYTORCH_DOCKER_BUILD}) set(PT_LIB_PATHS "") - FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${HPCX_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) set(PT_LIB_PATHS ${PT_LIB_PATHS} "${CMAKE_CURRENT_BINARY_DIR}/${plib}") ENDFOREACH(plib) @@ -444,7 +463,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) endif() # TRITON_PYTORCH_ENABLE_TORCHTRT - FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${HPCX_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) install( CODE "EXECUTE_PROCESS( @@ -479,7 +498,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) endif()" ) else() - FOREACH(plib ${PT_LIBS}) + FOREACH(plib ${PT_LIBS} ${HPCX_LIBS}) set(PT_LIB_PATHS ${PT_LIB_PATHS} "${TRITON_PYTORCH_LIB_PATHS}/${plib}") ENDFOREACH(plib) @@ -489,7 +508,7 @@ else() DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch ) - FOREACH(plib ${PT_LIBS}) + FOREACH(plib ${PT_LIBS} ${HPCX_LIBS}) install( CODE "EXECUTE_PROCESS( From fe49cc78768ab6c3ccd352c64b0e47a4780ece90 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 8 Jun 2023 12:27:07 -0700 Subject: [PATCH 09/14] Uncomment docker pull --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bff4de0..c94fed1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,7 +201,7 @@ if(NOT TRITON_ENABLE_GPU) add_custom_command( OUTPUT ${HPCX_LIBS} - # COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} + COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." || true COMMAND echo "Running ${TRITON_PYTORCH_DOCKER_IMAGE} to extract HPCX libraries" COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} From 523e4c07e5371fb5bdf9e863f6b8cf66fb004d3f Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 8 Jun 2023 12:28:04 -0700 Subject: [PATCH 10/14] Uncomment docker pull, second time --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c94fed1..732b788 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,7 +232,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) include/torch include/torchvision COMMAND ${CMAKE_COMMAND} -E make_directory "include/torchvision" - # COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} + COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." || true COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND /bin/sh -c "for i in ${LIBTORCH_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:/usr/local/lib/$i $i ; done" From ba31604908cfc1571b5df308aedf96859265ef6e Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 8 Jun 2023 12:28:32 -0700 Subject: [PATCH 11/14] Remove debugging print --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 732b788..233bdad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -277,7 +277,6 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM ) - message(STATUS "hpcx libs: ${HPCX_LIBS}") add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${HPCX_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) add_library(ptlib SHARED IMPORTED GLOBAL) add_dependencies(ptlib ptlib_target) From 45d2f9fc5fec1b9fbca33ecdbfb5e2297b47c355 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 8 Jun 2023 12:34:28 -0700 Subject: [PATCH 12/14] Remove debugging statements --- CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 233bdad..c8197f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -210,9 +210,7 @@ if(NOT TRITON_ENABLE_GPU) COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucp.so.0 libucp.so.0 COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucs.so.0 libucs.so.0 COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libuct.so.0 libuct.so.0 - COMMAND ls -la . COMMAND docker rm pytorch_backend_ptlib - COMMENT "Extracting HPCX libraries from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM ) endif() # TRITON_ENABLE_GPU From 2158177ef57fdb9b0bdababbc00bcbcc7e52068e Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 8 Jun 2023 12:34:42 -0700 Subject: [PATCH 13/14] Remove more debugging statements --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c8197f3..9980634 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,7 +203,6 @@ if(NOT TRITON_ENABLE_GPU) ${HPCX_LIBS} COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." || true - COMMAND echo "Running ${TRITON_PYTORCH_DOCKER_IMAGE} to extract HPCX libraries" COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucc/lib/libucc.so.1 libucc.so.1 COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucm.so.0 libucm.so.0 From a23c72ccd6535fd90d841252bbdb3e24ec7ba224 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 8 Jun 2023 15:40:55 -0700 Subject: [PATCH 14/14] Use different name for hpcx container --- CMakeLists.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9980634..03b6387 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -202,14 +202,14 @@ if(NOT TRITON_ENABLE_GPU) OUTPUT ${HPCX_LIBS} COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} - COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." || true - COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} - COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucc/lib/libucc.so.1 libucc.so.1 - COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucm.so.0 libucm.so.0 - COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucp.so.0 libucp.so.0 - COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libucs.so.0 libucs.so.0 - COMMAND docker cp -L pytorch_backend_ptlib:/opt/hpcx/ucx/lib/libuct.so.0 libuct.so.0 - COMMAND docker rm pytorch_backend_ptlib + COMMAND docker rm pytorch_backend_hpcxlib || echo "error ignored..." || true + COMMAND docker create --name pytorch_backend_hpcxlib ${TRITON_PYTORCH_DOCKER_IMAGE} + COMMAND docker cp -L pytorch_backend_hpcxlib:/opt/hpcx/ucc/lib/libucc.so.1 libucc.so.1 + COMMAND docker cp -L pytorch_backend_hpcxlib:/opt/hpcx/ucx/lib/libucm.so.0 libucm.so.0 + COMMAND docker cp -L pytorch_backend_hpcxlib:/opt/hpcx/ucx/lib/libucp.so.0 libucp.so.0 + COMMAND docker cp -L pytorch_backend_hpcxlib:/opt/hpcx/ucx/lib/libucs.so.0 libucs.so.0 + COMMAND docker cp -L pytorch_backend_hpcxlib:/opt/hpcx/ucx/lib/libuct.so.0 libuct.so.0 + COMMAND docker rm pytorch_backend_hpcxlib VERBATIM ) endif() # TRITON_ENABLE_GPU