diff --git a/.gitmodules b/.gitmodules index 3ea3f4a0903a0..eb37796f51f0a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -52,7 +52,8 @@ ignore = dirty [submodule "third_party/eigen3"] path = third_party/eigen3 - url = https://gitlab.com/libeigen/eigen.git + url = https://gitlab.com/paipinuo233/eigen.git + branch = support_musa ignore = dirty [submodule "third_party/snappy"] path = third_party/snappy diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d66288ac48580..2855a0dbe674f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -97,7 +97,7 @@ repos: files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$ args: - --extensions=c,cc,cxx,cpp,cu,cuh,h,hpp,hxx,kps - - --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens + - --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens,-whitespace/braces,-build/include - --quiet # Exclude third-party libraries exclude: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 632cf33100c7e..dfbe22ea13911 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF) option(WITH_XPU_PLUGIN "Compile PaddlePaddle with BAIDU XPU plugin" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_MUSA "Compile PaddlePaddle with MUSA platform" OFF) option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF) @@ -89,6 +90,9 @@ endif() if(WITH_GPU AND WITH_ROCM) message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() +if(WITH_GPU AND WITH_MUSA) + message(FATAL_ERROR "Error when compile CUDA and MUSA at the same time") +endif() if(WITH_GPU AND NOT APPLE) enable_language(CUDA) @@ -346,6 +350,7 @@ if(LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT WITH_GPU AND NOT WITH_ROCM + AND NOT WITH_MUSA AND NOT WITH_XPU AND NOT WITH_XPU_KP AND NOT WITH_XPU_XFT @@ -503,6 +508,31 @@ else() endif() endif() +if(WITH_MUSA) + include(musa) + include(mudnn) +endif() + +if(NOT WITH_MUSA AND WITH_MCCL) + message( + WARNING "Disable MCCL when compiling without MUSA. Force WITH_MCCL=OFF.") + set(WITH_MCCL + OFF + CACHE STRING "Disable MCCL when compiling without MUSA" FORCE) +endif() + +if(WITH_MCCL) + add_definitions("-DPADDLE_WITH_MCCL") + include(mccl) +else() + if(WITH_MUSA) + message( + WARNING + "If the environment is multi-card, the WITH_MCCL option needs to be turned on, otherwise only a single card can be used." 
+ ) + endif() +endif() + if(WITH_HETERPS AND WITH_PSLIB) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index dc661fce388fe..35e78b01b9bbe 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -175,6 +175,13 @@ elseif(WITH_ROCM) if(${MIOPEN_VERSION} VERSION_LESS 2090) message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile") endif() +elseif(WITH_MUSA) + add_definitions(-DPADDLE_WITH_MUSA) + add_definitions(-DEIGEN_USE_GPU) + add_definitions(-DEIGEN_USE_MUSA) + if(NOT MUDNN_FOUND) + message(FATAL_ERROR "Paddle needs MUDNN to compile") + endif() else() add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 06e37b3c8a602..a981007ba5aa5 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -17,7 +17,7 @@ include(ExternalProject) # update eigen to the commit id f612df27 on 03/16/2021 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3) set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3) -set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee) +set(EIGEN_TAG 6ad1f10acbc311dd82b20cce7f5c305ae8c3eaa9) set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/eigen3) if(WIN32) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 947d44950d52b..56c9c0de2f24b 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -425,6 +425,9 @@ function(cc_binary TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() + if(WITH_MUSA) + target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB}) + endif() check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS}) @@ -452,6 +455,9 @@ function(cc_test_build TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() + if(WITH_MUSA) + target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB}) + endif() check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS}) endif() endfunction() @@ -775,6 +781,111 @@ function(hip_test TARGET_NAME) endif() endfunction() +function(musa_library TARGET_NAME) + if(WITH_MUSA) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + if(musa_library_SRCS) + if(musa_library_SHARED OR musa_library_shared) # build *.so + musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) + else() + musa_add_library(${TARGET_NAME} STATIC ${musa_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) + endif() + if(musa_library_DEPS) + add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${musa_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND musa_library_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else() + if(musa_library_DEPS) + list(REMOVE_DUPLICATES musa_library_DEPS) + generate_dummy_static_lib( + LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR + "generic.cmake:musa_library") + + target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) + add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) + else() + message(FATAL "Please specify source file or library in musa_library.") + endif() + endif() + endif() +endfunction() + +function(musa_binary 
TARGET_NAME) + if(WITH_MUSA) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_binary "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${musa_binary_SRCS}) + if(musa_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${musa_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${musa_binary_DEPS}) + common_link(${TARGET_NAME}) + endif() + endif() +endfunction() + +function(musa_test TARGET_NAME) + if(WITH_MUSA AND WITH_TESTING) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + musa_add_executable(${TARGET_NAME} ${musa_test_SRCS}) + # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE + target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries( + ${TARGET_NAME} + ${musa_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + glog + phi + ${os_dependency_modules}) + add_dependencies( + ${TARGET_NAME} + ${musa_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + phi + glog) + common_link(${TARGET_NAME}) + add_test(${TARGET_NAME} ${TARGET_NAME}) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cudnn_deterministic=true) + set_property( + TEST ${TARGET_NAME} + PROPERTY + ENVIRONMENT + "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH" + ) + endif() +endfunction() + function(xpu_library TARGET_NAME) if(WITH_XPU_KP) set(options STATIC static SHARED shared) @@ -1274,6 +1385,15 @@ function(math_library TARGET) ${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + elseif(WITH_MUSA) + musa_library( + ${TARGET} + SRCS + ${cc_srcs} + ${cu_srcs} + DEPS + ${math_library_DEPS} + ${math_common_deps}) elseif(${cc_srcs_len} GREATER 0) cc_library( ${TARGET} diff --git a/cmake/mccl.cmake b/cmake/mccl.cmake new file mode 100644 index 0000000000000..fd32500458161 --- /dev/null +++ b/cmake/mccl.cmake @@ -0,0 +1,52 @@ +if(NOT WITH_MUSA) + return() +endif() + +# Now we don't support MCCL on windows +if(WIN32) + return() +endif() + +if(WITH_MCCL) + set(MCCL_ROOT + "/usr/local/musa/" + CACHE PATH "MCCL ROOT") + find_path( + MCCL_INCLUDE_DIR mccl.h + PATHS ${MCCL_ROOT} ${MCCL_ROOT}/include ${MCCL_ROOT}/local/include + $ENV{MCCL_ROOT} $ENV{MCCL_ROOT}/include $ENV{MCCL_ROOT}/local/include + NO_DEFAULT_PATH) + + if(MCCL_INCLUDE_DIR) + file(READ ${MCCL_INCLUDE_DIR}/mccl.h MCCL_VERSION_FILE_CONTENTS) + + string(REGEX MATCH "define MCCL_MAJOR +([0-9]+)" MCCL_MAJOR_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MCCL_MAJOR +([0-9]+)" "\\1" MCCL_MAJOR_VERSION + "${MCCL_MAJOR_VERSION}") + string(REGEX MATCH "define MCCL_MINOR +([0-9]+)" MCCL_MINOR_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MCCL_MINOR +([0-9]+)" "\\1" MCCL_MINOR_VERSION + "${MCCL_MINOR_VERSION}") + string(REGEX MATCH "define MCCL_PATCH +([0-9]+)" MCCL_PATCH_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MCCL_PATCH +([0-9]+)" "\\1" MCCL_PATCH_VERSION + "${MCCL_PATCH_VERSION}") + if(NOT MCCL_MAJOR_VERSION) + set(MCCL_VERSION "???") + else() + math(EXPR MCCL_VERSION "${MCCL_MAJOR_VERSION} * 1000 + + 
${MCCL_MINOR_VERSION} * 100 + ${MCCL_PATCH_VERSION}") + endif() + add_definitions("-DMCCL_VERSION_CODE=$MCCL_VERSION") + include_directories(${MCCL_INCLUDE_DIR}) + + message(STATUS "Current MCCL header is ${MCCL_INCLUDE_DIR}/mccl.h. ") + message( + STATUS + "Current MCCL version is " + "v${MCCL_MAJOR_VERSION}.${MCCL_MINOR_VERSION}.${MCCL_PATCH_VERSION} ") + else() + message(FATAL_ERROR "WITH_MCCL is enabled but mccl.h file is not found!") + endif() +endif() diff --git a/cmake/mudnn.cmake b/cmake/mudnn.cmake new file mode 100644 index 0000000000000..81027890d144e --- /dev/null +++ b/cmake/mudnn.cmake @@ -0,0 +1,92 @@ +if(NOT WITH_MUSA) + return() +endif() + +if(WIN32) + return() +else() + set(MUDNN_ROOT + "/usr/local/musa" + CACHE PATH "MUDNN ROOT") +endif() + +find_path( + MUDNN_INCLUDE_DIR mudnn.h + PATHS ${MUDNN_ROOT} ${MUDNN_ROOT}/include $ENV{MUDNN_ROOT} + $ENV{MUDNN_ROOT}/include ${MUSA_TOOLKIT_INCLUDE} + NO_DEFAULT_PATH) + +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + +list( + APPEND + MUDNN_CHECK_LIBRARY_DIRS + ${MUDNN_ROOT} + ${MUDNN_ROOT}/lib64 + ${MUDNN_ROOT}/lib + ${MUDNN_ROOT}/lib/x64 + ${MUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + $ENV{MUDNN_ROOT} + $ENV{MUDNN_ROOT}/lib64 + $ENV{MUDNN_ROOT}/lib + $ENV{MUDNN_ROOT}/lib/x64 + /usr/lib + ${MUSA_TOOLKIT_ROOT_DIR} + ${MUSA_TOOLKIT_ROOT_DIR}/lib/x64) +set(MUDNN_LIB_NAME "") + +if(LINUX) + set(MUDNN_LIB_NAME "libmudnn.so") +endif() + +find_library( + MUDNN_LIBRARY + NAMES ${MUDNN_LIB_NAME} + PATHS ${MUDNN_CHECK_LIBRARY_DIRS} ${MUDNN_INCLUDE_DIR} + NO_DEFAULT_PATH + DOC "Path to muDNN library.") + +if(MUDNN_INCLUDE_DIR AND MUDNN_LIBRARY) + set(MUDNN_FOUND ON) +else() + set(MUDNN_FOUND OFF) +endif() + +macro(find_mudnn_version mudnn_version_file) + file(READ ${mudnn_version_file} MUDNN_VERSION_FILE_CONTENTS) + get_filename_component(MUDNN_LIB_PATH ${MUDNN_LIBRARY} DIRECTORY) + + string(REGEX MATCH "define MUDNN_VERSION_MAJOR +([0-9]+)" MUDNN_MAJOR_VERSION + "${MUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MUDNN_VERSION_MAJOR +([0-9]+)" "\\1" + MUDNN_MAJOR_VERSION "${MUDNN_MAJOR_VERSION}") + string(REGEX MATCH "define MUDNN_VERSION_MINOR +([0-9]+)" MUDNN_MINOR_VERSION + "${MUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MUDNN_VERSION_MINOR +([0-9]+)" "\\1" + MUDNN_MINOR_VERSION "${MUDNN_MINOR_VERSION}") + string(REGEX MATCH "define MUDNN_VERSION_PATCH +([0-9]+)" MUDNN_PATCH_VERSION + "${MUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MUDNN_VERSION_PATCH +([0-9]+)" "\\1" + MUDNN_PATCH_VERSION "${MUDNN_PATCH_VERSION}") + + if(NOT MUDNN_MAJOR_VERSION) + set(MUDNN_VERSION "???") + else() + add_definitions("-DMUDNN_MAJOR_VERSION=\"${MUDNN_MAJOR_VERSION}\"") + math(EXPR MUDNN_VERSION "${MUDNN_MAJOR_VERSION} * 1000 + + ${MUDNN_MINOR_VERSION} * 100 + ${MUDNN_PATCH_VERSION}") + message(STATUS "Current muDNN version file is ${mudnn_version_file} ") + message( + STATUS + "Current muDNN version is v${MUDNN_MAJOR_VERSION}.${MUDNN_MINOR_VERSION}.${MUDNN_PATCH_VERSION}. 
" + ) + endif() +endmacro() + +if(MUDNN_FOUND) + find_mudnn_version(${MUDNN_INCLUDE_DIR}/mudnn_version.h) + include_directories(${MUDNN_INCLUDE_DIR}) +endif() diff --git a/cmake/musa.cmake b/cmake/musa.cmake new file mode 100644 index 0000000000000..fa1268fbce02b --- /dev/null +++ b/cmake/musa.cmake @@ -0,0 +1,123 @@ +if(NOT WITH_MUSA) + return() +endif() + +if(NOT DEFINED ENV{MUSA_PATH}) + set(MUSA_PATH + "/usr/local/musa" + CACHE PATH "Path to which ROCm has been installed") +else() + set(MUSA_PATH + $ENV{MUSA_PATH} + CACHE PATH "Path to which ROCm has been installed") +endif() +set(CMAKE_MODULE_PATH "${MUSA_PATH}/cmake" ${CMAKE_MODULE_PATH}) + +find_package(MUSA REQUIRED) +include_directories(${MUSA_PATH}/include) + +# set openmp include directory +set(llvm_openmp_search_list) +foreach(item RANGE 6 20 1) + list(APPEND llvm_openmp_search_list /usr/lib/llvm-${item}/include/openmp/) +endforeach() + +find_path( + OPENMP_INCLUDE_DIR omp.h + PATHS ${llvm_openmp_search_list} REQUIRED + NO_DEFAULT_PATH) +include_directories(${OPENMP_INCLUDE_DIR}) + +macro(find_musa_version musa_version_file) + set(python_file ${PROJECT_BINARY_DIR}/get_version.py) + set(MUSA_VERSION + "None" + CACHE STRING "musa version" FORCE) + file( + WRITE ${python_file} + "" + "import json\n" + "import sys\n" + "with open(sys.argv[1], 'r') as f:\n" + " data = json.load(f)\n" + " print(data[\"MUSA_RUNTIME\"][\"version\"])" + "") + + execute_process( + COMMAND "python" "${python_file}" ${musa_version_file} + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE python_res + OUTPUT_VARIABLE python_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(python_res EQUAL 0) + set(MUSA_VERSION ${python_out}) + endif() + string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\1" MUSA_MAJOR_VERSION + "${MUSA_VERSION}") + string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\2" MUSA_MINOR_VERSION + "${MUSA_VERSION}") + string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\3" MUSA_PATCH_VERSION + "${MUSA_VERSION}") + + if(NOT MUSA_MAJOR_VERSION) + set(MUSA_VERSION "???") + message(WARNING "Cannot find MUSA version in ${MUSA_PATH}/version.json") + else() + math( + EXPR + MUSA_VERSION + "${MUSA_MAJOR_VERSION} * 10000 + ${MUSA_MINOR_VERSION} * 100 + ${MUSA_PATCH_VERSION}" + ) + message(STATUS "Current MUSA version file is ${MUSA_PATH}/version.json.") + message( + STATUS + "Current MUSA version is v${MUSA_MAJOR_VERSION}.${MUSA_MINOR_VERSION}.${MUSA_PATCH_VERSION} " + ) + endif() +endmacro() +find_musa_version(${MUSA_PATH}/version.json) + +list(APPEND MUSA_MCC_FLAGS -Wno-macro-redefined) +list(APPEND MUSA_MCC_FLAGS -Wno-deprecated-copy-with-user-provided-copy) +list(APPEND MUSA_MCC_FLAGS -Wno-pragma-once-outside-header) +list(APPEND MUSA_MCC_FLAGS -Wno-return-type) +list(APPEND MUSA_MCC_FLAGS -Wno-sign-compare) +list(APPEND MUSA_MCC_FLAGS -Wno-mismatched-tags) +list(APPEND MUSA_MCC_FLAGS -Wno-pessimizing-move) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-but-set-variable) +list(APPEND MUSA_MCC_FLAGS -Wno-bitwise-instead-of-logical) +list(APPEND MUSA_MCC_FLAGS -Wno-format) +list(APPEND MUSA_MCC_FLAGS -Wno-self-assign) +list(APPEND MUSA_MCC_FLAGS -Wno-literal-conversion) +list(APPEND MUSA_MCC_FLAGS -Wno-unknown-warning-option) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-variable) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-value) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-local-typedef) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-lambda-capture) +list(APPEND MUSA_MCC_FLAGS -Wno-reorder-ctor) +list(APPEND MUSA_MCC_FLAGS 
-Wno-braced-scalar-init) +list(APPEND MUSA_MCC_FLAGS -Wno-pass-failed) +list(APPEND MUSA_MCC_FLAGS -Wno-missing-braces) +list(APPEND MUSA_MCC_FLAGS -Wno-dangling-gsl) + +if(WITH_CINN) + list(APPEND MUSA_MCC_FLAGS -std=c++14) +else() + list(APPEND MUSA_MCC_FLAGS -std=c++17) +endif() + +list(APPEND MUSA_MCC_FLAGS --cuda-gpu-arch=mp_21) +list(APPEND MUSA_MCC_FLAGS -U__CUDA__) +# MUSA has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer +list(APPEND MUSA_MCC_FLAGS -D__MUSA_NO_HALF_CONVERSIONS__) + +#set(MUSA_VERBOSE_BUILD ON) +if(CMAKE_BUILD_TYPE MATCHES Debug) + list(APPEND MUSA_MCC_FLAGS -g2) + list(APPEND MUSA_MCC_FLAGS -O0) +endif() + +set(musa_runtime_library_name musart) +find_library(MUSARTC_LIB ${musa_runtime_library_name} HINTS ${MUSA_PATH}/lib) +message(STATUS "MUSARTC_LIB: ${MUSARTC_LIB}") diff --git a/cmake/phi.cmake b/cmake/phi.cmake index 3c234c6b93326..c160c2834abbd 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -103,8 +103,8 @@ function(kernel_declare TARGET_LIST) set(first_registry "") endif() endif() - # some gpu kernel only can run on cuda, not support rocm, so we add this branch - if(WITH_ROCM) + # some gpu kernel only can run on cuda, not support rocm and musa, so we add this branch + if(WITH_ROCM OR WITH_MUSA) string(FIND "${first_registry}" "cuda_only" pos) if(pos GREATER 1) set(first_registry "") diff --git a/cmake/version.cmake b/cmake/version.cmake index e6707665a3851..6b1905352bbad 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -86,12 +86,19 @@ function(version version_file) "WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_GPU: ${WITH_GPU}\n" "WITH_ROCM: ${WITH_ROCM}\n" + "WITH_MUSA: ${WITH_MUSA}\n" "WITH_IPU: ${WITH_IPU}\n") if(WITH_GPU) file(APPEND ${version_file} "CUDA version: ${CUDA_VERSION}\n" "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n") endif() + if(WITH_MUSA) + file( + APPEND ${version_file} + "MUSA version: v${MUSA_MAJOR_VERSION}.${MUSA_MINOR_VERSION}.${MUSA_PATCH_VERSION}\n" + "MUDNN version: v${MUDNN_MAJOR_VERSION}.${MUDNN_MINOR_VERSION}\n") + endif() if(WITH_ROCM) file(APPEND ${version_file} "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n" diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 82d99a3835230..06c27f1d205c1 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -272,7 +272,8 @@ static std::shared_ptr GetGC( int64_t max_memory_size = framework::GetEagerDeletionThreshold(); std::shared_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { if (framework::IsFastEagerDeletionModeEnabled()) { gc.reset(new framework::UnsafeFastGPUGarbageCollector(place, diff --git a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc index 2e3389af5feb5..02955f46018f6 100644 --- a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc @@ -71,7 +71,8 @@ bool CondInterceptor::GetCondResult() { const auto& cond_tensor = cond_var->Get(); bool res = false; if (platform::is_gpu_place(cond_tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + 
defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; framework::TensorCopy(cond_tensor, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(cond_tensor.place())->Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 4836d656d180f..4b59290c2b87a 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -76,7 +76,8 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_data.data.length()); } else if (platform::is_gpu_place(place)) { VLOG(3) << "Loading data for GPU."; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = dynamic_cast(pool.Get(place)); auto gpu_place = place; diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 7567236c4ff68..d3c0df2a11595 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -132,7 +132,8 @@ void ScaleAPI(const paddle::Tensor& x, bias_after_scale, dense_out.get()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (expected_kernel_place == paddle::platform::CUDAPlace()) { auto* dev_ctx = dynamic_cast(pool.Get(expected_kernel_place)); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 7fe53febc5a9b..b96b997976be4 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -124,7 +124,7 @@ def FindParsingFunctionFromAttributeType(atype): FUNCTION_SET_DEVICE_TEMPLATE = """{} SetPythonStack(); if (paddle::platform::is_gpu_place(place)) {{ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(4) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; #else diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index a66bc211d513c..03d0bfbf5ed23 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -98,7 +98,8 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { auto& place = dense_tensor->place(); if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); #else diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 1620c99ce8560..aca4ce5f23d8c 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -45,6 +45,8 @@ class ConvSearchCache { AlgorithmsCache* GetConvFusion() { return &fusion_forward_cache_; } +#elif defined(PADDLE_WITH_MUSA) + #else 
AlgorithmsCache* GetForward() { return &forward_cache_; @@ -72,6 +74,8 @@ class ConvSearchCache { AlgorithmsCache backward_data_cache_; AlgorithmsCache backward_filter_cache_; AlgorithmsCache fusion_forward_cache_; +#elif defined(PADDLE_WITH_MUSA) + #else AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index 10e0b76f00459..b2f389bb965a0 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -32,7 +32,8 @@ namespace framework { static std::vector CreatePlaceList() { std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPlace(0)); #endif return places; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index ebfed9a6f73f6..44cfbf77ea6c2 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -123,7 +123,8 @@ static void RunKernelFunc( "Input tensor (%s) is not initialized.", in_name)); paddle::Tensor custom_in; custom_in.set_impl(std::make_shared(*x)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (custom_in.is_gpu_pinned()) { VLOG(3) << "Custom Operator: custom input is gpu pinned tensor"; auto gpu_place = phi::GPUPlace(platform::GetCurrentDeviceId()); @@ -1174,7 +1175,8 @@ static void RegisterOperatorKernel( } RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CUDAPlace()); #endif diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 32c4845bd0d57..bd03b7cf4c59c 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -1526,7 +1526,9 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( #endif } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) template void PrivateInstantDataFeed::PutToFeedVec() { for (size_t i = 0; i < use_slots_.size(); ++i) { diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 1057640842c2c..b3ba9b7fd4fdd 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -1951,7 +1951,9 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { int pv_batch_size_; }; -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) template class PrivateInstantDataFeed : public DataFeed { public: diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index e058b19469000..887de75181709 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -70,7 +70,9 @@ 
REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif } // namespace framework diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 9d114fcf56396..b2fb089f53574 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -101,7 +101,7 @@ struct CastDataType { in_end, out_begin, CastDataTypeFunctor()); -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) } else if (platform::is_gpu_place(in_.place())) { phi::Transform trans; auto* context = static_cast(ctx_); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 4d9a88cf22372..9f146d960b026 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -202,6 +202,94 @@ elseif(WITH_ROCM) fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) +elseif(WITH_MUSA) + musa_library( + nan_inf_utils + SRCS + nan_inf_utils_detail.cc + DEPS + framework_proto + scope + place + phi) + musa_library( + all_reduce_op_handle + SRCS + all_reduce_op_handle.cc + DEPS + op_handle_base + scope + lod_tensor + phi + memory + dynload_cuda + variable_visitor) + musa_library( + fused_all_reduce_op_handle + SRCS + fused_all_reduce_op_handle.cc + DEPS + all_reduce_op_handle + op_handle_base + variable_visitor + scope + lod_tensor + phi + memory + dynload_cuda + place) + musa_library( + grad_merge_all_reduce_op_handle + SRCS + grad_merge_all_reduce_op_handle.cc + DEPS + fused_all_reduce_op_handle + op_handle_base + scope + lod_tensor + phi + memory + dynload_cuda + variable_visitor + place + all_reduce_op_handle) + + if(WITH_DISTRIBUTE) + musa_library( + reduce_op_handle + SRCS + reduce_op_handle.cc + DEPS + op_handle_base + variable_visitor + scope + phi + dynload_cuda) + else() + musa_library( + reduce_op_handle + SRCS + reduce_op_handle.cc + DEPS + op_handle_base + variable_visitor + scope + phi + dynload_cuda) + endif() + musa_library( + broadcast_op_handle + SRCS + broadcast_op_handle.cc + DEPS + op_handle_base + scope + phi + memory + variable_visitor + dynload_cuda) + musa_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS + broadcast_op_handle) else() cc_library( nan_inf_utils @@ -386,7 +474,9 @@ endif() if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM)) + AND (WITH_GPU + OR WITH_ROCM + OR WITH_MUSA)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() cc_library( diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 69f7a49ce55fd..73707458c073a 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -186,7 +186,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "fuse_relu_depthwise_conv_pass"); AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass"); -#if (defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif @@ -545,7 +546,8 @@ USE_PASS(fused_feedforward_pass); #ifdef PADDLE_WITH_MKLDNN USE_PASS(mkldnn_placement_pass); #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 4012263f688cb..2c4b73d73b56d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,8 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include @@ -44,7 +45,8 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( place_(place), var_infos_(vars.begin(), vars.end()), gc_(gc) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); @@ -53,6 +55,9 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -75,12 +80,15 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } EagerDeletionOpHandle::~EagerDeletionOpHandle() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (event_) { auto gpu_place = dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(gpu_place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -89,7 +97,8 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { } void EagerDeletionOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif @@ -177,7 +186,8 @@ void EagerDeletionOpHandle::RunImpl() { void EagerDeletionOpHandle::ClearGarbages( std::deque> *garbages) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (event_) { auto compute_stream = dev_ctx_->stream(); auto callback_stream = @@ -187,6 +197,10 @@ void EagerDeletionOpHandle::ClearGarbages( PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, compute_stream)); 
PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(callback_stream, event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(callback_stream, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -197,7 +211,8 @@ void EagerDeletionOpHandle::ClearGarbages( } else { #endif gc_->Add(std::move(*garbages)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif } diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 0a92269c50ad2..e08267938b822 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -80,7 +80,8 @@ class EagerDeletionOpHandle : public OpHandleBase { std::vector var_infos_; // not own GarbageCollector *gc_; // not own std::vector vars_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::GPUContext *dev_ctx_{nullptr}; gpuEvent_t event_{nullptr}; #endif diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 9fd6a08e02302..f14bb44a76cd4 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -135,7 +135,8 @@ static void TransData(const phi::DenseTensor *src_item, const platform::DeviceContext &ctx) { if (src_item->IsInitialized() && src_item->numel() > 0) { if (platform::is_gpu_place(src_item->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index a36b63da9b8b6..d522791c13875 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -121,7 +121,8 @@ static void TransData(const phi::DenseTensor &src_item, phi::DenseTensor *dst_item) { if (src_item.IsInitialized() && src_item.numel() > 0) { if (platform::is_gpu_place(src_item.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 29d5697b23f0d..f3ad442609171 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -32,7 +32,8 @@ typedef std::vector< std::vector>> GradientAndLoDTensor; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_MCCL) FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -61,11 +62,14 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( #endif FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { -#if 
defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_MCCL) auto destroy_event = [](gpuEvent_t event) { if (event == nullptr) return; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif @@ -103,6 +107,9 @@ void FusedAllReduceOpHandle::RunImpl() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(event, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(event, cudaEventDisableTiming)); @@ -126,6 +133,10 @@ void FusedAllReduceOpHandle::RunImpl() { PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(nccl_stream, start_event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(nccl_stream, start_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -185,12 +196,17 @@ void FusedAllReduceOpHandle::RunImpl() { FusedAllReduceFunc(in_var_handles, out_var_handles); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_MCCL) if (FLAGS_allreduce_record_one_event) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(compute_stream, end_event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(compute_stream, end_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 3437eb5570dc7..1f0895ae7dd30 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -47,7 +47,8 @@ struct TestGatherOpHandle { void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -224,7 +225,8 @@ TEST(GatherTester, TestCPUGatherTestSelectedRows) { test_op.TestGatherSelectedRows(input_scope_idx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(GatherTester, TestGPUGatherTestSelectedRows) { TestGatherOpHandle test_op; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 80c029a5fd976..fa3e72ab75cd1 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -183,7 +183,8 @@ void CheckVarHasNanOrInf(const std::string& op_type, << ", place:" << tensor->place() << ", numel:" 
<< tensor->numel(); if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 82f09f51c23e1..c880e6abf5b1c 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -31,11 +31,14 @@ std::string OpHandleBase::DebugString() const { } OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) for (auto &ev : events_) { if (ev.second) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -45,13 +48,17 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { } void OpHandleBase::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) for (auto &p : dev_ctxes_) { int dev_id = p.first.device; platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&events_[dev_id], musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); @@ -136,7 +143,8 @@ void OpHandleBase::InitXPU() { } void OpHandleBase::Run(DeviceType use_device) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) { InitCUDA(); } @@ -172,7 +180,8 @@ void OpHandleBase::Run(DeviceType use_device) { } void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_NOT_NULL( waited_ctx, platform::errors::InvalidArgument("Argument waited_ctx is NULL.")); @@ -188,6 +197,8 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream, ev.second, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); #endif @@ -221,12 +232,16 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto stream = static_cast(dev_ctxes_.at(place))->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#elif defined(PADDLE_WITH_MUSA) + 
PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -248,7 +263,8 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto stream = @@ -273,13 +289,17 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { auto *in_var_handle = dynamic_cast(in_var); if (in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -311,7 +331,8 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { callback(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { auto dev_id = p.first.device; @@ -320,6 +341,9 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); @@ -331,7 +355,8 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { void OpHandleBase::RunAndRecordEvent(platform::Place p, const std::function &callback) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_cpu_place(p) || events_.empty()) { callback(); } else { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 9afe56e4babd4..b9411082e2dce 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -161,7 +161,8 @@ class OpHandleBase { // See https://github.com/PaddlePaddle/Paddle/pull/32283 bool is_variant_scope_ = false; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::unordered_map events_; #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 7587fb6553cd7..bb9fbd605aeca 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -303,7 +303,8 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) { 
test_op.InitReduceOp(out_scope_idx); test_op.TestReduceLodTensors(out_scope_idx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(ReduceTester, TestGPUReduceTestSelectedRows) { TestReduceOpHandle test_op; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 9dac1a7203f8d..11490d85d183f 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -76,7 +76,8 @@ struct ScaleLossGradFunctor { "Please recompile or reinstall Paddle with XPU support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) OutT cast_coeff = static_cast(coeff_); auto stream = static_cast(ctx_)->stream(); memory::Copy(place_, @@ -110,7 +111,8 @@ void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { auto *tensor = var->GetMutable(); tensor->Resize(phi::make_ddim({1})); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) ScaleLossGradFunctor func( coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); if (record_event) { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index 02a68fb697efb..dce9d9ab621bb 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -95,7 +95,8 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype( } void ShareTensorBufferOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index a6314220d5c26..c78267882aaaf 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -129,7 +129,8 @@ struct VarHandle : public VarHandleBase { name_(std::move(name)), place_(std::move(place)) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) bool HasEvent() { return has_event_; } const gpuEvent_t& GetEvent() { @@ -154,7 +155,8 @@ struct VarHandle : public VarHandleBase { size_t scope_idx_; std::string name_; platform::Place place_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Only when this event is triggered, var is generated. 
gpuEvent_t event_; bool has_event_{false}; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 25d29e469a498..3b3a51b234de4 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -84,12 +84,13 @@ class PullDenseWorker { public: virtual ~PullDenseWorker() {} virtual void Initialize(const TrainerDesc& param); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) void AddPlace(const paddle::platform::Place place) { places_.push_back(place); } @@ -154,7 +155,8 @@ class PullDenseWorker { float total_batch_num_ = 0; std::unordered_map scope_to_thread_id_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::vector copy_streams_; #endif std::vector places_; @@ -185,7 +187,8 @@ class DeviceWorker { virtual void ProduceTasks() {} virtual void GetXpuOpIndex() {} virtual void Schedule(int taskid UNUSED) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) virtual void SetStream(const gpuStream_t stream UNUSED) {} virtual void SetEvent(const gpuEvent_t event UNUSED) {} #endif @@ -561,7 +564,8 @@ class PSGPUWorker : public HogwildWorker { new (&program_) ProgramDesc(main_program); } void ProduceTasks() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } #endif @@ -629,7 +633,8 @@ class PSGPUWorker : public HogwildWorker { std::unordered_map> feasign_set_; paddle::framework::Channel> pull_queue_; paddle::framework::Channel> push_queue_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuEvent_t event_; gpuStream_t copy_stream_; #endif @@ -802,7 +807,8 @@ class HeterSectionWorker : public DeviceWorker { Scope* GetThreadScope() override { return minibatch_scope_; } // multi-stream - // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || + // defined(PADDLE_WITH_MUSA) // void SetStream(const gpuStream_t stream) override {} // void SetEvent(const gpuEvent_t event) override {} // #endif diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 46b917cda740a..a55e640c0be32 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -96,7 +96,8 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPlace &place) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) ::DLDevice device; device.device_type = kDLGPU; device.device_id = place.device; @@ -108,7 +109,8 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPinnedPlace &place) const { -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) ::DLDevice device; device.device_type = kDLCPUPinned; device.device_id = 0; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index f6b28b0a22ebc..af3368de4dc8e 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -108,7 +108,8 @@ void TestToDLManagedTensor(const platform::Place &place, uint16_t lanes) { template void TestMainLoop() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::vector places{platform::CPUPlace(), platform::CUDAPlace(0), platform::CUDAPinnedPlace()}; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index e0ad2255743c4..b3f8525998257 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -492,7 +492,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, std::unique_ptr gc; if (!ctx->force_disable_gc_ && max_memory_size >= 0) { if (platform::is_gpu_place(place_)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); } else { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 5f46906cf8e82..e370631443d56 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -161,6 +161,11 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, values.data(), values.size() * sizeof(float*), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_values, + values.data(), + values.size() * sizeof(float*), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_values, values.data(), diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 9853c328cd14e..b3432277805a7 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -595,6 +595,9 @@ class BoxWrapper { data->resize(len); #ifdef PADDLE_WITH_HIP hipMemcpy(data->data(), gpu_data, sizeof(T) * len, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + data->data(), gpu_data, sizeof(T) * len, musaMemcpyDeviceToHost); #else cudaMemcpy( data->data(), gpu_data, sizeof(T) * len, cudaMemcpyDeviceToHost); diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index d72e418aadd3e..cba6da070ac55 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -44,7 +44,9 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; int device_id = place.GetDeviceId(); phi::DenseTensor& total_keys_tensor = 
keys_tensor[device_id]; @@ -70,6 +72,15 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_keys, + keys.data(), + keys.size() * sizeof(uint64_t*), + musaMemcpyHostToDevice); + musaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_keys, keys.data(), @@ -153,7 +164,9 @@ void BoxWrapper::PushSparseGradCase( PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) int device_id = place.GetDeviceId(); phi::DenseTensor& cached_total_keys_tensor = keys_tensor[device_id]; uint64_t* total_keys = diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 05433c1014656..75adf94e1ce61 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -784,7 +784,8 @@ void FleetWrapper::PushDenseVarsSync( const uint64_t table_id, const std::vector& var_names) {} -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ (defined PADDLE_WITH_PSLIB) void FleetWrapper::PushDenseVarsAsync( const Scope& scope, @@ -816,6 +817,9 @@ void FleetWrapper::PushDenseVarsAsync( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); hipEventSynchronize(event); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, stream)); + musaEventSynchronize(event); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index fb5cf91729256..7fa90285e4fb3 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -175,7 +175,8 @@ class FleetWrapper { // Push dense variables to server in async mode // Param: scope, table_id, var_names, scale_datanorm, batch_size // Param: push_sparse_status -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 2cae0721aefa9..b00b25b4eab8d 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -121,7 +121,8 @@ void HeterWrapper::SerializeToReq(const std::string& varname, tensor->numel() * SizeOfType(framework::TransToProtoVarType(tensor->dtype()))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), @@ -141,7 +142,8 @@ void HeterWrapper::SerializeToReq(const std::string& varname, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + 
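Where the guarded code calls a runtime API directly, the patch instead adds an `#elif defined(PADDLE_WITH_MUSA)` branch with the MUSA equivalent, as in the box_wrapper `musaMemcpy` and fleet_wrapper `musaEventRecord`/`musaEventSynchronize` hunks above. A condensed sketch of that three-way dispatch; the header names are assumptions, since the patch shows only the call sites:

```cpp
// Host-to-device copy, dispatched at compile time to the active toolkit.
#include <cstddef>
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_MUSA)
#include <musa_runtime.h>   // assumed MUSA runtime header name
#else
#include <cuda_runtime.h>
#endif

inline void CopyHostToDevice(void* dst, const void* src, std::size_t bytes) {
#if defined(PADDLE_WITH_HIP)
  hipMemcpy(dst, src, bytes, hipMemcpyHostToDevice);
#elif defined(PADDLE_WITH_MUSA)
  musaMemcpy(dst, src, bytes, musaMemcpyHostToDevice);
#else
  cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
#endif
}
```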
defined(PADDLE_WITH_MUSA) void HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, @@ -169,7 +171,8 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, void* tensor_data = tensor->mutable_data( place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) memory::Copy(place, tensor_data, platform::CPUPlace(), diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 77838fbec6d00..ec4bc3a984c2c 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -92,7 +92,8 @@ class HeterWrapper { framework::proto::VarType::Type ToVarType(VariableMessage::Type type); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 3296679e1eeeb..f49ca915fb674 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -13,7 +13,8 @@ // limitations under the License. #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "gflags/gflags.h" @@ -64,7 +65,8 @@ void IPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) {} @@ -93,6 +95,8 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, platform::CUDADeviceGuard guard(place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_.reset( diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index f3d9ec54e6968..5150c91d0af0c 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -85,7 +85,8 @@ class IPUGarbageCollector : public GarbageCollector { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 1c186373cdbb5..e80a292d17f92 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -3,7 +3,9 @@ add_subdirectory(memory_optimize_pass) add_subdirectory(multi_devices_graph_pass) if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM)) + AND (WITH_GPU + OR WITH_ROCM + OR WITH_MUSA)) 
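garbage_collector.cc follows the same ladder for stream creation (`musaStreamCreate`), wrapped in `PADDLE_ENFORCE_GPU_SUCCESS`; the matching `musaStreamDestroy` appears further down in resource_manager.cc. A self-contained sketch, where `CheckOk()` is a local stand-in for the Paddle macro and `musaStream_t` is assumed by analogy with `cudaStream_t`:

```cpp
// RAII stream mirroring the StreamGarbageCollector change.
#include <cstdlib>

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using StreamT = hipStream_t;
inline void CheckOk(hipError_t s) { if (s != hipSuccess) std::abort(); }
#elif defined(PADDLE_WITH_MUSA)
#include <musa_runtime.h>  // assumed header name
using StreamT = musaStream_t;  // assumed type name
inline void CheckOk(musaError_t s) { if (s != musaSuccess) std::abort(); }
#else
#include <cuda_runtime.h>
using StreamT = cudaStream_t;
inline void CheckOk(cudaError_t s) { if (s != cudaSuccess) std::abort(); }
#endif

struct ScopedStream {
  StreamT stream{};
  ScopedStream() {
#if defined(PADDLE_WITH_HIP)
    CheckOk(hipStreamCreate(&stream));
#elif defined(PADDLE_WITH_MUSA)
    CheckOk(musaStreamCreate(&stream));
#else
    CheckOk(cudaStreamCreate(&stream));
#endif
  }
  ~ScopedStream() {
#if defined(PADDLE_WITH_HIP)
    hipStreamDestroy(stream);
#elif defined(PADDLE_WITH_MUSA)
    musaStreamDestroy(stream);
#else
    cudaStreamDestroy(stream);
#endif
  }
};
```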
add_subdirectory(fusion_group) endif() @@ -159,7 +161,9 @@ if(WITH_TENSORRT) pass_library(split_layernorm_to_math_ops_pass inference) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) pass_library(cudnn_placement_pass base DEPS placement_pass_base) pass_library(embedding_eltwise_layernorm_fuse_pass inference) endif() diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index 9ca3190fd092f..49b96836cfbbf 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -128,7 +128,8 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[op_push_index].CpuElapsedMs( main_thread_events[op_pop_index]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpu_time_ms = main_thread_events[op_push_index].CudaElapsedMs( main_thread_events[op_pop_index]); #endif @@ -152,7 +153,8 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[start_profiler_idx].CpuElapsedMs( main_thread_events[stop_profiler_idx]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpu_time_ms = main_thread_events[start_profiler_idx].CudaElapsedMs( main_thread_events[stop_profiler_idx]); #endif diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index 299e700edb95d..951c861bb7a4b 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -34,8 +34,10 @@ namespace framework { namespace ir { void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ + CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; graph = FuseBatchNormAct(graph, act_types); diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index 506e8721298b6..df5cbfa9e7e0b 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -25,7 +25,8 @@ namespace framework { namespace ir { void FuseBatchNormAddActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 06593733e6a27..9cffdaed6a59e 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -27,7 +27,8 @@ namespace phi { class DenseTensor; } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) namespace paddle { namespace framework { diff --git 
a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index a0f1d9eed0038..67b154989d346 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -203,7 +203,8 @@ TEST(test_reference_count_pass, test_no_need_buffer_var_shrink) { {}); std::vector use_cuda_list{false}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) use_cuda_list.push_back(true); #endif for (auto use_cuda : use_cuda_list) { diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index 1e6a6f02e2230..dcd12bed40ad4 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -53,7 +53,8 @@ inline std::tuple GetThreadPoolConfig(const phi::Place& place, processor_count = std::thread::hardware_concurrency(); if (processor_count) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) device_count = phi::backends::gpu::GetGPUDeviceCount(); #endif } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 13896b66f3c55..16398806597e3 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -641,7 +641,8 @@ void BuildOpFuncList(const platform::Place& place, *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context); auto expected_kernel_key = framework::TransPhiKernelKeyToOpKernelType( op_with_kernel->GetExpectedKernelType(exec_ctx)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (op_with_kernel->CanCUDNNBeUsed(exec_ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 1ae7e5e59ce1f..6680af7eb3206 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -48,7 +48,8 @@ DECLARE_bool(benchmark); DECLARE_uint64(executor_log_deps_every_microseconds); PHI_DECLARE_bool(new_executor_use_cuda_graph); PHI_DECLARE_bool(enable_new_ir_in_executor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_bool(sync_nccl_allreduce); #endif diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 3b40a3b0727f1..9382d7a4bd090 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -892,7 +892,8 @@ void NewIRInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if 
(FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; @@ -1245,7 +1246,8 @@ void NewIRInterpreter::RecordStreamForGC(const Instruction& instr) { void NewIRInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) RecordStreamForGC(instr); #endif auto& var_scope = var_scope_; diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 95eee77d36288..d72f201a9e02b 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -42,7 +42,8 @@ class ProfilerGuard { private: void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto cuda_place = place; cost_info_->device_memory_bytes = platform::RecordedGpuMallocSize(cuda_place.device); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index b6c54192a6970..d14bc40d32217 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -880,7 +880,8 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; @@ -1232,7 +1233,8 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { void ProgramInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) RecordStreamForGC(instr); #endif auto& var_scope = var_scope_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 8cb29a0d5df4c..8bc37165e67b8 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -358,7 +358,8 @@ struct OpKernelRegistrarFunctorExCanCUDNNBeUsed(exe_ctx, kernel_type.data_type_)) { auto tmp_kernel_type = kernel_type; tmp_kernel_type.library_type_ = framework::LibraryType::kCUDNN; @@ -1544,7 +1546,8 @@ bool OperatorWithKernel::CanCUDNNBeUsed(const framework::ExecutionContext& ctx, bool use_cudnn = ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn") && paddle::platform::is_gpu_place(ctx.GetPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (use_cudnn) { auto& dev_ctx = ctx.device_context(); use_cudnn &= 
(dev_ctx.cudnn_handle() != nullptr); @@ -1783,7 +1786,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (this->CanCUDNNBeUsed(exe_ctx, kernel_type_->data_type_)) { kernel_type_->library_type_ = framework::LibraryType::kCUDNN; } @@ -2109,7 +2113,8 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (this->CanCUDNNBeUsed(ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; } @@ -2132,7 +2137,8 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( // CPUKernel will be executed and a warning will be given at the same // time. expected_kernel_key.place_ = platform::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (SupportGPU()) { auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e6a2058107b1d..50802e83fd7fa 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -575,7 +575,8 @@ class ExecutionContext : public phi::KernelContext { return device_context_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) const inline phi::GPUContext& cuda_device_context() const { PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 806b8570108b9..b85a7bb0fa381 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -41,14 +41,16 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" PHI_DECLARE_double(eager_delete_tensor_gb); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -69,7 +71,8 @@ static std::once_flag gProfileOnce; static bool gProfileStarted = false; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::once_flag p2p_init_flag; #endif @@ -512,7 +515,8 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size)); } else { @@ -621,7 +625,8 @@ bool ParallelExecutor::NeedCreateLocalExeScope() { } void InitP2P(const std::vector &places) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::call_once(p2p_init_flag, [&]() { int count = places.size(); if (count <= 1) return; @@ -642,6 +647,10 @@ void InitP2P(const std::vector &places) { hipError_t ret = hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); if (ret != hipSuccess || can_acess != 1) { +#elif defined(PADDLE_WITH_MUSA) + musaError_t ret = + musaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); + if (ret != musaSuccess || can_acess != 1) { #else cudaError_t ret = cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); @@ -653,6 +662,8 @@ void InitP2P(const std::vector &places) { platform::CUDADeviceGuard guard(devices[i]); #ifdef PADDLE_WITH_HIP hipDeviceEnablePeerAccess(devices[j], 0); +#elif defined(PADDLE_WITH_MUSA) + musaDeviceEnablePeerAccess(devices[j], 0); #else cudaDeviceEnablePeerAccess(devices[j], 0); #endif @@ -1299,7 +1310,9 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( BuildStrategy::ReduceStrategy::kAllReduce; member_->use_all_reduce_ = true; } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + defined(_WIN32) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( device_count, @@ -1308,7 +1321,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( } #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( @@ -1674,7 +1688,8 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( final_graphs = *async_graphs; } else if (member_->build_strategy_.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + 
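The InitP2P hunk in parallel_executor.cc adds the MUSA variants of the peer-access calls (`musaDeviceCanAccessPeer`, `musaDeviceEnablePeerAccess`, `musaError_t`, `musaSuccess`). A reduced sketch of the per-device-pair logic, assuming the toolkit includes from the earlier sketches; device enumeration and the CUDADeviceGuard are omitted:

```cpp
// Returns true and enables access when device dev_i can reach device dev_j's memory.
// Assumes the current device has already been set to dev_i by the caller.
inline bool EnablePeerAccessIfPossible(int dev_i, int dev_j) {
  int can_access = 0;
#if defined(PADDLE_WITH_HIP)
  hipError_t ret = hipDeviceCanAccessPeer(&can_access, dev_i, dev_j);
  if (ret != hipSuccess || can_access != 1) return false;
  hipDeviceEnablePeerAccess(dev_j, 0);
#elif defined(PADDLE_WITH_MUSA)
  musaError_t ret = musaDeviceCanAccessPeer(&can_access, dev_i, dev_j);
  if (ret != musaSuccess || can_access != 1) return false;
  musaDeviceEnablePeerAccess(dev_j, 0);
#else
  cudaError_t ret = cudaDeviceCanAccessPeer(&can_access, dev_i, dev_j);
  if (ret != cudaSuccess || can_access != 1) return false;
  cudaDeviceEnablePeerAccess(dev_j, 0);
#endif
  return true;
}
```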
defined(PADDLE_WITH_MUSA) // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. bool is_inference = details::IsDataParallelInferenceGraph(*graph); diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 9881d479a75a2..070c85d425ee0 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -134,7 +134,8 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key, phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == phi::Backend::GPU || kernel_key.backend() == phi::Backend::GPUDNN) { PADDLE_THROW( diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index f8589e95ff8e9..33493669755e9 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -72,7 +72,8 @@ struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template <> struct ConvertToPhiContext { using TYPE = phi::GPUContext; diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 7b61052a20151..40296242b1927 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -69,11 +69,12 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { fleet_ptr_ = FleetWrapper::GetInstance(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) copy_streams_.clear(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) places_.clear(); thread_scopes_.clear(); #endif @@ -81,7 +82,7 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { void PullDenseWorker::CreatePinVar() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); @@ -96,7 +97,8 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); phi::DenseTensor* pin_tensor = ptr->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); #endif @@ -126,7 +128,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { } status_vec->resize(0); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) for (size_t i = 0; i < places_.size(); ++i) { // for (auto& v : dense_value_names_) { @@ -144,7 +146,8 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); phi::DenseTensor* tensor = var->GetMutable(); float* w = tensor->data(); -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), @@ -180,7 +183,7 @@ void PullDenseWorker::PullDense(bool force_update) { dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 58e879a5011c2..d2f8a9f955608 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -228,7 +228,8 @@ void SectionWorker::TrainFiles() { int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 5ef6f53d38d50..37378d4d3a161 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -114,7 +114,8 @@ TEST(DenseTensor, MutableData) { EXPECT_EQ(static_cast(p2[0]), 1); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; float* p1 = nullptr; @@ -168,7 +169,8 @@ TEST(DenseTensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; phi::DenseTensor dst_tensor; @@ -206,7 +208,8 @@ TEST(DenseTensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 9}), @@ -295,7 +298,8 @@ TEST(DenseTensor, Split) { EXPECT_EQ(src_data_address + 2 * 2 * i * sizeof(int), split_data_address); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 4}), @@ -357,7 +361,8 @@ TEST(DenseTensor, Chunk) { EXPECT_EQ(src_data_address + 2 * 2 * i * sizeof(int), split_data_address); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 4}), diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index d8224cb0dd72b..df8bfcbb5d473 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -124,7 +124,8 @@ void TensorCopyImpl(const TENSOR& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -377,7 +378,8 @@ void TensorCopySync(const phi::DenseTensor& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -479,7 +481,8 @@ void TensorToStream(std::ostream& os, platform::errors::ResourceExhausted( "tensor size %d overflow when writing tensor", size)); if (platform::is_gpu_place(tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); auto& gpu_dev_ctx = static_cast(dev_ctx); @@ -614,7 +617,8 @@ void TensorFromStream(std::istream& is, platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(shape)); framework::VisitDataType( @@ -687,7 +691,8 @@ void TensorFromStream(std::istream& is, platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(dims)); framework::VisitDataType( @@ -809,7 +814,8 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) { if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(dl_tensor.device.device_id); @@ -849,7 +855,8 @@ void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) { void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (src->dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(src->dl_tensor.device.device_id); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 36a3e968251c9..c9ec8f0c34d79 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -129,7 +129,8 @@ void TensorFromArray(const T* src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, 
src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -175,7 +176,8 @@ void TensorFromVector(const std::vector& src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -304,7 +306,8 @@ void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -346,7 +349,8 @@ inline void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index bda2681f57f31..638114df3d2da 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -58,7 +58,8 @@ TEST(TensorCopy, Tensor) { } EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; phi::DenseTensor gpu_tensor; @@ -153,7 +154,8 @@ TEST(TensorFromVector, Tensor) { delete cpu_place; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor cpu_tensor; @@ -232,7 +234,8 @@ TEST(TensorToVector, Tensor) { EXPECT_EQ(src_ptr[i], dst[i]); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor gpu_tensor; @@ -323,7 +326,8 @@ TEST(TensorFromDLPack, Tensor) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor cpu_tensor; @@ -489,7 +493,8 @@ TEST(Tensor, FromAndToStream) { EXPECT_EQ(dst_tensor.dims(), src_tensor.dims()); delete place; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor gpu_tensor; gpu_tensor.Resize({2, 3}); diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index bf69bed9d4851..4d9b39a77ec04 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -172,7 +172,8 @@ class 
HeterServiceContext { int place_num_; Scope* scope_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuEvent_t event_; #endif std::vector ops_; @@ -204,7 +205,8 @@ class HeterXpuTrainer : public TrainerBase { virtual std::string GetDumpPath(int tid) { return ""; } virtual void InitDumpEnv() {} template -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void HeterMemCpy(phi::DenseTensor* tensor, phi::DenseTensor* root_tensor, const paddle::platform::Place& thread_place, @@ -242,7 +244,8 @@ class HeterXpuTrainer : public TrainerBase { std::vector place_scopes_; BtObjectPool object_pool_; std::vector places_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::vector copy_streams_; std::vector events_; #endif diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index d73c9b7d95957..0b289b8a6ddff 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -37,6 +37,13 @@ #include "paddle/fluid/operators/miopen_rnn_cache.h" #endif +#ifdef PADDLE_WITH_MUSA +#if defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT +#endif +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 2e188e6caa076..b62347b1561bf 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -33,6 +33,12 @@ #include #endif #endif +#ifdef PADDLE_WITH_MUSA +#include +#if defined(PADDLE_WITH_MCCL) +#include +#endif +#endif #ifdef PADDLE_WITH_HIP #include #ifdef PADDLE_WITH_RCCL @@ -59,7 +65,8 @@ class SparseCsrTensor; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class Communicator; class NCCLCommunicator; @@ -189,14 +196,18 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_MCCL) ncclUniqueId, platform::Communicator, platform::NCCLCommunicator, #endif +#ifndef PADDLE_WITH_MUSA operators::CudnnRNNCache, #endif +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index be715a2a451ad..da93b60b4a280 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -138,7 +138,8 @@ AmpOperators::AmpOperators() block_ops_(new std::unordered_set()), unsupported_fp16_ops_(new std::unordered_set()), unsupported_bf16_ops_(new std::unordered_set()) { -#if defined(PADDLE_WITH_CUDA) || 
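The var_type_traits change is slightly different: the type registry keeps the NCCL-family communicator entries whenever NCCL, RCCL, or MCCL is enabled, but compiles out `operators::CudnnRNNCache` on MUSA builds. A loose illustration using `std::variant` as a stand-in for `VarTypeRegistryImpl` (the stub types are placeholders, not Paddle classes):

```cpp
// Conditional type-list membership, mirroring the var_type_traits.h hunk.
#include <string>
#include <variant>

struct CommunicatorStub {};   // placeholder for platform::NCCLCommunicator
struct CudnnRNNCacheStub {};  // placeholder for operators::CudnnRNNCache

using VarRegistrySketch = std::variant<
    int, float, std::string
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
    defined(PADDLE_WITH_MCCL)
    , CommunicatorStub   // communicator types kept for any collective backend
#endif
#ifndef PADDLE_WITH_MUSA
    , CudnnRNNCacheStub  // dropped on MUSA builds
#endif
    >;
```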
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto unsupported_ops_gpu_fp16 = std::get<2>( OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16)); unsupported_fp16_ops_->insert(unsupported_ops_gpu_fp16.begin(), diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 14b9bc5aae0bc..7199762e0c5ac 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -204,7 +204,8 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_TENSOR_ADD(float, phi::GPUContext); PADDLE_TENSOR_ADD(double, phi::GPUContext); PADDLE_TENSOR_ADD(phi::dtype::float16, phi::GPUContext); @@ -313,7 +314,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); @@ -321,7 +323,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif @@ -364,7 +367,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); @@ -372,7 +376,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif @@ -425,7 +430,8 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, return dst_var; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); @@ -441,7 +447,8 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_XPU) } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif @@ -712,7 +719,8 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { @@ -778,7 +786,8 @@ void 
SortedGradientAccumulator::SumGrad(std::shared_ptr var, // Increase count IncreaseCurCnt(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif tmp_grad_vars_.clear(); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index cda2fad5d7436..206c3e562e70a 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -205,7 +205,8 @@ PreparedOp PrepareImpl( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (op.CanCUDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.dtype())) { expected_kernel_key.set_backend(phi::Backend::GPUDNN); } @@ -555,7 +556,8 @@ static void PreparedOpRunImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif @@ -645,7 +647,8 @@ static void PreparedOpRunPtImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index ccb58d320221c..2c0669aa12883 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -106,7 +106,8 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if (gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gc.reset(new framework::DefaultStreamGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; @@ -116,7 +117,8 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gc.reset(new framework::CUDAPinnedGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; @@ -274,7 +276,8 @@ void Tracer::TraceOpImpl(const std::string& type, try { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::SetDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 65e149925e742..4777082196771 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -36,7 +36,8 @@ namespace paddle { namespace 
inference { namespace analysis { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. if (!argument->use_gpu()) return; @@ -209,7 +210,8 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { argument->scope_valid(), true, platform::errors::PreconditionNotMet("The scope field should be valid")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (argument->use_gpu_valid()) { CopyParamsToGpu(argument); } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index ee29af1c13308..86f8a12539809 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -32,7 +32,8 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { std::string repr() const override; private: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void CopyParamsToGpu(Argument *argument); #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 25c7e7e2a03d4..902034a9bd899 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -32,7 +32,8 @@ #include "paddle/fluid/inference/tensorrt/helper.h" #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_uint64(initial_gpu_memory_in_mb); #endif @@ -100,7 +101,8 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id, Precision precision_mode) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_; @@ -630,7 +632,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { } void AnalysisConfig::EnableCUDNN() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) use_cudnn_ = use_gpu_; #else LOG(ERROR) << "Please compile with CUDA first to use cuDNN"; @@ -928,7 +931,8 @@ void AnalysisConfig::Update() { } if (use_gpu() && use_cudnn_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (!enable_ir_optim_) { LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled."; } else { @@ -1145,7 +1149,8 @@ void AnalysisConfig::SetCpuMathLibraryNumThreads( } float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Get the GPU memory details and calculate the fraction of memory for the // GPU 
memory pool. size_t gpu_total, gpu_available; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 56652c2f42cb7..d6c535c591cda 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -99,7 +99,8 @@ namespace paddle { namespace { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, GPUContextResource *gpu_resource, Place place_) { @@ -131,7 +132,9 @@ void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, gpu_context->SetBlasTF32Handle( gpu_resource->GetBlasTF32TensorCoreHandleCreator()); gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator()); +#ifndef PADDLE_WITH_MUSA gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator()); +#endif gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator()); gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice()); @@ -270,7 +273,8 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place)); auto dst_gpu_place = place; @@ -370,7 +374,8 @@ bool AnalysisPredictor::Init( return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // TODO(inference): Now only gpu with external stream support private // device_context. if (config_.use_gpu_ && config_.use_external_stream_) { @@ -418,7 +423,8 @@ void AnalysisPredictor::InitPlace() { platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (config_.thread_local_stream_enabled()) { LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. " "Please use config.SetExecStream instead."; @@ -489,14 +495,16 @@ void AnalysisPredictor::InitPlace() { } void AnalysisPredictor::InitResourceManager(void *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) predictor_stream_ = ResourceManager::Instance().InitGPUResource(place_, stream); #endif } void AnalysisPredictor::InitDeviceContexts() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Init GPUContext. 
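In UpdatePrivateDeviceContext the solver handle setter is excluded with `#ifndef PADDLE_WITH_MUSA`, and the same exclusion reappears below in infer_context.h and resource_manager.cc, presumably because no cuSOLVER counterpart is wired up for MUSA yet (an inference on my part, not stated in the patch). Schematically:

```cpp
// Illustrative only: a context that exposes a solver handle everywhere
// except on MUSA builds. Names are placeholders, not the real Paddle interfaces.
struct InferGpuContextSketch {
  void SetDnnHandle(void* h) { dnn_ = h; }
#ifndef PADDLE_WITH_MUSA
  void SetSolverHandle(void* h) { solver_ = h; }  // absent on MUSA builds
#endif

 private:
  void* dnn_{nullptr};
#ifndef PADDLE_WITH_MUSA
  void* solver_{nullptr};
#endif
};
```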
if (place_.GetType() == phi::AllocationType::GPU) { device_contexts_.emplace( @@ -534,7 +542,8 @@ void AnalysisPredictor::InitDeviceContexts() { } void *AnalysisPredictor::GetExecStream() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (place_.GetType() == phi::AllocationType::GPU) { if (private_context_) { return predictor_stream_; @@ -2151,7 +2160,8 @@ bool AnalysisPredictor::ZeroCopyRun() { return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (!private_context_) { PADDLE_THROW(platform::errors::Fatal( @@ -2162,6 +2172,8 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (stream != predictor_stream_) { #ifdef PADDLE_WITH_HIP hipStreamSynchronize(static_cast(predictor_stream_)); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(static_cast(predictor_stream_)); #else cudaStreamSynchronize(static_cast(predictor_stream_)); #endif @@ -2199,11 +2211,14 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); if (config_.use_gpu()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto *dev_ctx = pool.Get(place_); auto stream = static_cast(dev_ctx)->stream(); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(stream); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(stream); #else cudaStreamSynchronize(stream); #endif @@ -2595,7 +2610,8 @@ AnalysisPredictor::~AnalysisPredictor() { if (config_.shape_range_info_collected()) { StatisticShapeRangeInfo(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (predictor_stream_ != nullptr) { ResourceManager::Instance().DestroyGPUResource(predictor_stream_); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index bde6ca48741ad..547cf0d2284be 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -220,7 +220,8 @@ class AnalysisPredictor : public PaddlePredictor { /// bool ZeroCopyRun() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Note: Can only be used under thread_local semantics. 
bool ExpRunWithExternalStream(const gpuStream_t stream); #endif diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 28353150c265c..faf3cedce947d 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -250,7 +250,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 37ee2b4df643d..dd219f2c59fd5 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -108,7 +108,8 @@ T *Tensor::mutable_data(PlaceType place) { return tensor->mutable_data(paddle::platform::CPUPlace()); } case static_cast(PlaceType::kGPU): { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_castmutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_caststream()); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(dev_ctx->stream()); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(dev_ctx->stream()); #else // async, return stream if (nullptr != exec_stream) { @@ -821,7 +826,8 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, auto *t_data = tensor->mutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(t->device_); auto *t_data = tensor->mutable_data(gpu_place); paddle::memory::Copy(gpu_place, @@ -891,7 +897,8 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); #endif } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), t_place, diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc index 533363f1b25da..b2abb21602dc9 100644 --- a/paddle/fluid/inference/api/infer_context.cc +++ b/paddle/fluid/inference/api/infer_context.cc @@ -21,7 +21,8 @@ namespace paddle { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) InferGPUContext::InferGPUContext(const phi::Place& place) : phi::GPUContext(place, false) {} #endif diff --git a/paddle/fluid/inference/api/infer_context.h 
b/paddle/fluid/inference/api/infer_context.h index 2b5c4e974eb08..eef3d31a5c493 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -26,7 +26,8 @@ class InferCPUContext : public phi::CPUContext { using phi::CPUContext::SetEigenDevice; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class InferGPUContext : public phi::GPUContext { public: explicit InferGPUContext(const phi::Place& place); @@ -35,7 +36,9 @@ class InferGPUContext : public phi::GPUContext { using phi::GPUContext::SetBlasTF32Handle; using phi::GPUContext::SetDnnHandle; using phi::GPUContext::SetEigenDevice; +#ifndef PADDLE_WITH_MUSA using phi::GPUContext::SetSolverHandle; +#endif using phi::GPUContext::SetSparseHandle; using phi::GPUContext::SetStream; // using phi::GPUContext::SetDnnWorkspaceHandle; diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index 3f06ee5722af9..a13fa97b69185 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -44,7 +44,8 @@ namespace paddle { namespace internal { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class EigenGpuStreamDevice : public Eigen::StreamInterface { public: EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { @@ -102,6 +103,9 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); @@ -132,7 +136,8 @@ void CPUContextResource::InitCPUResource() { CPUContextResource::CPUContextResource() { InitCPUResource(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) GPUContextResource::GPUContextResource(const phi::Place& place, void* stream) : place_(place) { InitGPUResource(stream); @@ -158,6 +163,8 @@ void GPUContextResource::DestroyGPUResource() { if (owned_stream_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); #endif @@ -166,8 +173,10 @@ void GPUContextResource::DestroyGPUResource() { DestroyDnnHandle(); DestroyBlasHandle(); +#ifndef PADDLE_WITH_MUSA DestroyBlasLtHandle(); DestroySolverHandle(); +#endif DestroySparseHandle(); } @@ -204,7 +213,7 @@ void GPUContextResource::DestroyBlasHandle() { phi::DestroyBlasHandle(blas_tensor_core_handle_); phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); } - +#ifndef PADDLE_WITH_MUSA void GPUContextResource::InitBlasLtHandle() { phi::InitBlasLtHandle(&blaslt_handle_); } @@ -220,6 +229,7 @@ void GPUContextResource::InitSolverHandle() { void GPUContextResource::DestroySolverHandle() { phi::DestroySolverHandle(solver_handle_); } +#endif void GPUContextResource::InitSparseHandle() { phi::InitSparseHandle(&sparse_handle_, stream_); @@ -287,6 +297,7 @@ GPUContextResource::GetBlasTF32TensorCoreHandleCreator() { }; } +#ifndef 
PADDLE_WITH_MUSA blasLtHandle_t GPUContextResource::GetBlasLtHandle() const { return blaslt_handle_; } @@ -310,6 +321,7 @@ GPUContextResource::GetSolverDnHandleCreator() { return solver_handle_; }; } +#endif phi::sparseHandle_t GPUContextResource::GetSparseHandle() const { return sparse_handle_; @@ -375,7 +387,8 @@ CPUContextResource* ResourceManager::GetCPUResource() const { return cpu_resource_.get(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void* ResourceManager::InitGPUResource(const phi::Place& place, void* stream) { std::lock_guard lock_gurad(gpu_mutex_); if (gpu_resources_.count(stream)) { diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index e14de1c2ffc86..36841a46c4878 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -25,7 +25,8 @@ #include "paddle/phi/common/place.h" #include "unsupported/Eigen/CXX11/Tensor" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -49,7 +50,8 @@ class CPUContextResource { std::unique_ptr cpu_eigen_device_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class GPUContextResource { public: explicit GPUContextResource(const phi::Place& place, void* stream); @@ -60,8 +62,10 @@ class GPUContextResource { std::function GetBlasHandleCreator(); std::function GetBlasTensorCoreHandleCreator(); std::function GetBlasTF32TensorCoreHandleCreator(); +#ifndef PADDLE_WITH_MUSA std::function GetBlasLtHandleCreator(); std::function GetSolverDnHandleCreator(); +#endif std::function GetSparseHandleCreator(); std::function GetGpuEigenDeviceCreator(); @@ -70,8 +74,10 @@ class GPUContextResource { blasHandle_t GetBlasHandle() const; blasHandle_t GetBlasTensorCoreHandle() const; blasHandle_t GetBlasTF32Handle() const; +#ifndef PADDLE_WITH_MUSA blasLtHandle_t GetBlasLtHandle() const; phi::solverHandle_t GetSolverDnHandle() const; +#endif phi::sparseHandle_t GetSparseHandle() const; Eigen::GpuDevice* GetGpuEigenDevice() const; int GetGpuComputeCapability() const; @@ -90,10 +96,12 @@ class GPUContextResource { void InitDnnHanlde(); void DestroyDnnHandle(); void DestroyBlasHandle(); +#ifndef PADDLE_WITH_MUSA void InitBlasLtHandle(); void DestroyBlasLtHandle(); void InitSolverHandle(); void DestroySolverHandle(); +#endif void InitSparseHandle(); void DestroySparseHandle(); @@ -116,9 +124,11 @@ class GPUContextResource { blasHandle_t blas_handle_{nullptr}; blasHandle_t blas_tensor_core_handle_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; +#ifndef PADDLE_WITH_MUSA blasLtHandle_t blaslt_handle_{nullptr}; - dnnHandle_t dnn_handle_{nullptr}; phi::solverHandle_t solver_handle_{nullptr}; +#endif + dnnHandle_t dnn_handle_{nullptr}; phi::sparseHandle_t sparse_handle_{nullptr}; // DnnWorkspaceHandle }; @@ -141,7 +151,8 @@ class ResourceManager { std::mutex cpu_mutex_; std::unique_ptr cpu_resource_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // GPU Resource public: void* 
InitGPUResource(const phi::Place& place, void* stream); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 6de5f9cfa0ca1..edf34de39f4e6 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -127,7 +127,8 @@ void MemoryCopyAsync(const platform::Place& dst_place, if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) { memory::Copy(cpu_place, dst_data, cpu_place, src_data, size); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_cpu_place(dst_place) && platform::is_gpu_place(src_place)) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 2500f624967c6..3c8f0694ee774 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -218,6 +218,9 @@ void QkvToContextPluginDynamic::configurePlugin( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 1a39590398911..c044d25053ba3 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -19,7 +19,9 @@ set(ALLOCATOR_SRCS buddy_allocator.cc system_allocator.cc) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) list( APPEND ALLOCATOR_SRCS @@ -89,6 +91,10 @@ if(WITH_ROCM) SRCS thread_local_allocator_test.cc DEPS allocator) endif() +if(WITH_MUSA) + musa_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc + DEPS allocator) +endif() if(WITH_GPU) nv_test( @@ -100,6 +106,15 @@ elseif(WITH_ROCM) best_fit_allocator_test SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu DEPS allocator memcpy) +elseif(WITH_MUSA) + musa_test( + best_fit_allocator_test + SRCS + best_fit_allocator_test.cc + best_fit_allocator_test.cu + DEPS + allocator + memcpy) else() cc_test_old(best_fit_allocator_test SRCS best_fit_allocator_test.cc DEPS allocator) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 07e55115ba130..6ba81821871f8 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -27,7 +27,8 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/macros.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include #include "paddle/fluid/memory/allocation/cuda_allocator.h" @@ -164,7 +165,8 @@ class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) using CUDAAllocatorMap = std::map>>; @@ -187,7 +189,8 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } 
#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -214,7 +217,8 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) allow_free_idle_chunk_ = allow_free_idle_chunk; for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -286,7 +290,8 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -345,7 +350,8 @@ class AllocatorFacadePrivate { LIKELY(FLAGS_use_system_allocator == false); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) { auto it = cuda_allocators_.find(place); if (it == cuda_allocators_.end()) { @@ -594,7 +600,8 @@ class AllocatorFacadePrivate { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void InitNaiveBestFitCUDAPinnedAllocator() { allocators_[platform::CUDAPinnedPlace()] = std::make_shared(platform::CUDAPinnedPlace()); @@ -655,7 +662,7 @@ class AllocatorFacadePrivate { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " << FLAGS_auto_growth_chunk_size_in_mb; -#if defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto cuda_allocator = CreateCUDAAllocator(p); cuda_allocators_[p][stream] = std::make_shared( cuda_allocator, @@ -741,7 +748,7 @@ class AllocatorFacadePrivate { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " << FLAGS_auto_growth_chunk_size_in_mb; -#if defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto cuda_allocator = CreateCUDAAllocator(p); allocators_[p] = std::make_shared( cuda_allocator, @@ -1038,7 +1045,8 @@ class AllocatorFacadePrivate { system_allocators_[p] = std::make_shared(p); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); int device_count = platform::GetGPUDeviceCount(); @@ -1064,7 +1072,8 @@ class AllocatorFacadePrivate { if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); @@ -1112,7 +1121,8 @@ 
class AllocatorFacadePrivate { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (is_stream_safe_cuda_allocator_used_) { CheckCUDAAllocThreadSafe(cuda_allocators_); } @@ -1145,7 +1155,8 @@ class AllocatorFacadePrivate { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // a standalone CUDA allocator to support multi-stream GC in new executor std::map> default_stream_safe_cuda_allocators_; @@ -1252,7 +1263,8 @@ std::shared_ptr AllocatorFacade::AllocShared( AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) AllocatorFacadePrivate* m = GetPrivate(); if (!m->IsStreamSafeCUDAAllocatorUsed()) { VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; @@ -1278,7 +1290,8 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, bool AllocatorFacade::InSameStream( const std::shared_ptr& allocation, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuStream_t s = reinterpret_cast(stream.id()); return s == GetStream(allocation); #else @@ -1290,7 +1303,8 @@ bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() { return GetPrivate()->IsStreamSafeCUDAAllocatorUsed(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, gpuStream_t stream) { AllocatorFacadePrivate* m = GetPrivate(); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index a1f21a5e69359..92bbc03378be2 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -76,7 +76,8 @@ class AllocatorFacade { bool IsStreamSafeCUDAAllocatorUsed(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. 
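
The three-way guard defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) recurs at every allocator-facade site touched above. A hedged sketch of how such a guard could be centralized; the PADDLE_WITH_DEVICE_GPU name is hypothetical, and the patch itself spells the condition out at each use:

// Hypothetical convenience macro, shown only to make the repeated guard explicit.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_MUSA)
#define PADDLE_WITH_DEVICE_GPU 1
#endif

#ifdef PADDLE_WITH_DEVICE_GPU
// Stream-aware, GPU-only declarations (e.g. the Release/RecordStream
// overloads guarded above) would sit behind this single macro.
#endif
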
uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc index 1e09c43c4f12f..fe905932c626b 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -17,7 +17,8 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); @@ -46,7 +47,8 @@ void AllocateTestCases() { ASSERT_EQ(cpu_allocation->size(), size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { place = platform::CUDAPlace(0); size = 1024; @@ -82,7 +84,8 @@ void AllocateTestCases() { } TEST(Allocator, SpecifyGpuMemory) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and // FLAGS_reallocate_gpu_memory_in_mb FLAGS_fraction_of_gpu_memory_to_use = 0.0; diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc index 63e3eab3256c9..b88c952243a06 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc @@ -17,7 +17,8 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); @@ -46,7 +47,8 @@ void AllocateTestCases() { ASSERT_EQ(cpu_allocation->size(), size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { place = platform::CUDAPlace(0); size = 1024; @@ -82,7 +84,8 @@ void AllocateTestCases() { } TEST(Allocator, Allocator) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_gpu_allocator_retry_time = 500; FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index bfd05b6b323fe..47a4be778819b 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -23,7 +23,8 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); DECLARE_int64(gpu_allocator_retry_time); @@ -41,7 +42,8 @@ static inline size_t AlignTo(size_t size, size_t alignment) { } TEST(allocator, allocator) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_gpu_allocator_retry_time = 500; FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; @@ -102,7 +104,8 @@ TEST(allocator, allocator) { TEST(multithread_allocate, test_segfault) { FLAGS_allocator_strategy = "auto_growth"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::mutex mtx; std::condition_variable cv; bool flag = false; diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 8de464754cb35..8089f21a3619f 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -19,7 +19,8 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #define USE_DEVICE PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif @@ -53,7 +54,8 @@ BuddyAllocator::BuddyAllocator( platform::PlaceHelper::CreatePlace(dev_type)); }; } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) init_allocate_size_func_ = &platform::GpuInitAllocSize; re_allocate_size_func_ = &platform::GpuReallocSize; #endif @@ -249,7 +251,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( allocate_bytes = DeviceAllocateSize( init_allocate_size_func_, re_allocate_size_func_, request_bytes); #else -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) allocate_bytes = DeviceAllocateSize( &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); #endif diff --git a/paddle/fluid/memory/allocation/buddy_allocator_test.cc b/paddle/fluid/memory/allocation/buddy_allocator_test.cc index 1aeb1722d0ec8..e74544e292306 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator_test.cc @@ -26,7 +26,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -77,7 +78,8 @@ int* TestBuddyAllocator(BuddyAllocator* allocator, return nullptr; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(BuddyAllocator, GpuFraction) { // In a 16 GB machine, the pool size will be about 160 MB FLAGS_fraction_of_gpu_memory_to_use = 0.01; @@ -244,6 +246,9 @@ TEST(BuddyAllocator, AllocFromAvailable) { #ifdef PADDLE_WITH_HIP hipError_t result = hipMalloc(&p, available >> 1); EXPECT_TRUE(result == hipSuccess); +#elif defined(PADDLE_WITH_MUSA) + musaError_t result = musaMalloc(&p, available >> 1); + EXPECT_TRUE(result == musaSuccess); #else cudaError_t result = cudaMalloc(&p, available >> 1); EXPECT_TRUE(result == cudaSuccess); @@ -263,6 +268,8 @@ TEST(BuddyAllocator, AllocFromAvailable) { if (p) { #ifdef PADDLE_WITH_HIP EXPECT_TRUE(hipFree(p) == hipSuccess); +#elif defined(PADDLE_WITH_MUSA) + EXPECT_TRUE(musaFree(p) == musaSuccess); #else EXPECT_TRUE(cudaFree(p) == cudaSuccess); #endif @@ -278,6 +285,8 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) { #ifdef PADDLE_WITH_HIP EXPECT_TRUE(hipMalloc(&p, static_cast(1) << 30) == hipSuccess); +#elif defined(PADDLE_WITH_MUSA) + EXPECT_TRUE(musaMalloc(&p, static_cast(1) << 30) == musaSuccess); #else EXPECT_TRUE(cudaMalloc(&p, static_cast(1) << 30) == cudaSuccess); #endif @@ -294,6 +303,8 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) { if (p) { #ifdef PADDLE_WITH_HIP EXPECT_TRUE(hipFree(p) == hipSuccess); +#elif defined(PADDLE_WITH_MUSA) + EXPECT_TRUE(musaFree(p) == musaSuccess); #else EXPECT_TRUE(cudaFree(p) == cudaSuccess); #endif diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 781addd7dba60..51e6c88d55d50 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -19,6 +19,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 7286f84160c6a..42e6f7be8de31 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -82,6 +82,9 @@ class GPUContextAllocator : public Allocator { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreate(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreate(&event_, cudaEventDisableTiming)); @@ -92,8 +95,9 @@ class GPUContextAllocator : public Allocator { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_WARN_GPU_SUCCESS(hipEventDestroy(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_WARN_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_WARN_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -113,6 +117,9 @@ class GPUContextAllocator : public 
Allocator { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(default_stream_, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0)); diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc index 06e9fbe88827b..d1b68212736ee 100644 --- a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc @@ -19,6 +19,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.h b/paddle/fluid/memory/allocation/cuda_managed_allocator.h index a01e1c58d439b..3fdcfb8038086 100644 --- a/paddle/fluid/memory/allocation/cuda_managed_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index e436e6c439081..e4b0273a6efc3 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -26,7 +26,8 @@ #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/common/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" @@ -213,7 +214,8 @@ size_t Used(const platform::XPUPlace &place) { } // For CUDA -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class GPUBuddyAllocatorList { private: GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) { @@ -283,7 +285,8 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { template <> size_t Used(const platform::CUDAPlace &place) { -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \ + defined PADDLE_WITH_MUSA) return GetGPUBuddyAllocator(place.device)->Used(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -294,7 +297,8 @@ size_t Used(const platform::CUDAPlace &place) { template <> void *Alloc(const platform::CUDAPlace &place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { @@ -315,6 +319,8 @@ void *Alloc(const platform::CUDAPlace &place, if (FLAGS_init_allocated_mem) { #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0xEF, size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(ptr, 0xEF, size); #else cudaMemset(ptr, 0xEF, size); #endif @@ -331,7 +337,8 @@ template <> void Free(const 
platform::CUDAPlace &place, void *p, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) GetGPUBuddyAllocator(place.device)->Free(p); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -341,7 +348,8 @@ void Free(const platform::CUDAPlace &place, template <> uint64_t Release(const platform::CUDAPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return GetGPUBuddyAllocator(place.device)->Release(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -349,7 +357,8 @@ uint64_t Release(const platform::CUDAPlace &place) { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) BuddyAllocator *GetCUDAPinnedBuddyAllocator() { static std::once_flag init_flag; static BuddyAllocator *ba = nullptr; @@ -367,7 +376,8 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() { template <> size_t Used(const platform::CUDAPinnedPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return GetCUDAPinnedBuddyAllocator()->Used(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -378,7 +388,8 @@ size_t Used(const platform::CUDAPinnedPlace &place) { template <> void *Alloc(const platform::CUDAPinnedPlace &place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); auto *buddy_allocator = GetCUDAPinnedBuddyAllocator(); void *ptr = buddy_allocator->Alloc(size); @@ -401,7 +412,8 @@ template <> void Free(const platform::CUDAPinnedPlace &place, void *p, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -413,7 +425,8 @@ void Free(const platform::CUDAPinnedPlace &place, template <> uint64_t Release( const platform::CUDAPinnedPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) VLOG(10) << "Release on " << platform::Place(place); return GetCUDAPinnedBuddyAllocator()->Release(); #else @@ -602,7 +615,8 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const { } size_t Usage::operator()(const platform::CUDAPlace &gpu) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return Used(gpu); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -611,7 +625,8 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const { } size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return Used(cuda_pinned); #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc 
b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 37da748ee9c96..b6be358fde05c 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -33,7 +33,8 @@ TEST(NaiveBestFitAllocatorTest, CpuAlloc) { alloc.Release(platform::CPUPlace()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(NaiveBestFitAllocatorTest, GpuAlloc) { NaiveBestFitAllocator alloc{platform::CUDAPlace(0)}; { diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index f1c0178fafc02..567ec4e4c9461 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -23,6 +23,8 @@ bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaFreeHost(allocation->ptr())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif @@ -37,6 +39,8 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaHostAlloc(&ptr, size, musaHostAllocPortable)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index d1872ee00b7b7..115fa600ad972 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -19,7 +19,8 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/memory/allocation/cuda_allocator.h" #endif @@ -114,7 +115,8 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { platform::CUDAPlace p(0); RetryAllocator allocator(std::make_shared(p), retry_ms); diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 9f513448eea26..ae9738ee2afd8 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -86,6 +86,16 @@ bool StreamSafeCUDAAllocation::CanBeFreed() { } PADDLE_ENFORCE_GPU_SUCCESS(err); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + gpuError_t err = musaEventQuery(event); + if (err == musaErrorNotReady) { + VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; + // Erase the completded event before "it" + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else gpuError_t err = 
hipEventQuery(event); if (err == hipErrorNotReady) { @@ -122,6 +132,9 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&new_event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); @@ -136,6 +149,8 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(record_event, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 08ecdd4969730..efa0e8393aa20 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/memory/allocation/allocator.h" @@ -24,6 +25,8 @@ #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index 210be01669775..0ef6b35f8cdac 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -33,7 +33,8 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -120,7 +121,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { bool CPUAllocator::UseGpu() const { return false; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void* GPUAllocator::Alloc(size_t* index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr @@ -216,6 +218,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { // PINNED memory is visible to all CUDA contexts. 
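
The pinned-host allocations in these hunks map one runtime call per backend, each with the same "portable" flag. A compact sketch of that mapping, illustrative only (error checking via PADDLE_ENFORCE_GPU_SUCCESS is omitted; the calls and flags are the ones the patch uses):

// Sketch: allocate page-locked host memory visible to all device contexts.
static void* AllocPortablePinned(size_t size) {
  void* p = nullptr;
#ifdef PADDLE_WITH_HIP
  hipHostMalloc(&p, size, hipHostMallocPortable);
#elif defined(PADDLE_WITH_MUSA)
  musaHostAlloc(&p, size, musaHostAllocPortable);
#else
  cudaHostAlloc(&p, size, cudaHostAllocPortable);
#endif
  return p;
}
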
#ifdef PADDLE_WITH_HIP hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); +#elif defined(PADDLE_WITH_MUSA) + musaError_t result = musaHostAlloc(&p, size, musaHostAllocPortable); #else cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); #endif @@ -259,6 +263,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } +#elif defined(PADDLE_WITH_MUSA) + err = musaFreeHost(p); + if (err != musaErrorMusartUnloading) { + PADDLE_ENFORCE_EQ( + err, + 0, + platform::errors::Fatal( + "musaFreeHost failed in GPUPinnedAllocator, error code is %d", + err)); + } #else err = cudaFreeHost(p); diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h index 67376a3e39a22..cde4743e620a9 100644 --- a/paddle/fluid/memory/allocation/system_allocator.h +++ b/paddle/fluid/memory/allocation/system_allocator.h @@ -43,7 +43,8 @@ class CPUAllocator : public SystemAllocator { virtual bool UseGpu() const; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class GPUAllocator : public SystemAllocator { public: explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} diff --git a/paddle/fluid/memory/allocation/system_allocator_test.cc b/paddle/fluid/memory/allocation/system_allocator_test.cc index e04d14f0adfde..d6a203ef38f47 100644 --- a/paddle/fluid/memory/allocation/system_allocator_test.cc +++ b/paddle/fluid/memory/allocation/system_allocator_test.cc @@ -57,7 +57,8 @@ TEST(CPUAllocator, LockMem) { TestAllocator(&a, 0); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a(0); TestAllocator(&a, 2048); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 46f9b1189cb68..01220b0e44240 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -57,7 +57,8 @@ void* GetBasePtr(const std::shared_ptr& allocation) { return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index b8f5f0289c4bc..2e029c4ebae88 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -48,7 +48,8 @@ extern bool InSameStream(const std::shared_ptr& allocation, extern void* GetBasePtr(const std::shared_ptr& allocation); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) extern uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 4a56a01e640bf..c8ce60e7c39d6 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -256,7 +256,8 @@ void Copy(phi::Place dst_place, #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) 
|| defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K #ifdef PADDLE_WITH_HIP @@ -271,6 +272,18 @@ inline void SyncCUDAStream() { } #endif } +#elif defined(PADDLE_WITH_MUSA) +inline void SyncCUDAStream() { +#if !defined(_WIN32) + musaStreamSynchronize(0); +#else + musaError_t e_sync = musaSuccess; + while (e_sync = musaStreamQuery(0)) { + if (e_sync == musaErrorNotReady) continue; + break; + } +#endif +} #else inline void SyncCUDAStream() { #if !defined(_WIN32) @@ -313,6 +326,12 @@ void Copy( num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToHost, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -325,6 +344,8 @@ void Copy( "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -357,6 +378,12 @@ void Copy( num, hipMemcpyHostToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyHostToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -369,6 +396,8 @@ void Copy( "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -403,6 +432,12 @@ void Copy( num, hipMemcpyDeviceToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -416,6 +451,8 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); #endif @@ -502,6 +539,12 @@ void Copy( num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToHost, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -515,6 +558,8 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -544,6 +589,12 @@ void Copy( num, hipMemcpyHostToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyHostToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -557,6 +608,8 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -746,7 +799,8 @@ void Copy(phi::Place dst_place, dst_place.GetType() == 
phi::AllocationType::CPU) { std::memcpy(dst, src, num); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::GPUPINNED) { std::memcpy(dst, src, num); diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc index 081f0d3d78c13..e51859e791a08 100644 --- a/paddle/fluid/memory/memory_stats_test.cc +++ b/paddle/fluid/memory/memory_stats_test.cc @@ -40,7 +40,8 @@ TEST(stat_allocator_test, host_memory_stat_test) { EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), max_alloc_size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(stat_allocator_test, device_memory_stat_test) { std::vector alloc_sizes{ 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 6ec8d77da2c85..7adcc1e09c24a 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -16,6 +16,10 @@ limitations under the License. */ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include +#endif + #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index d1dc7d8986bec..8cdb244d8af6f 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -55,7 +55,8 @@ struct ArrayToLoDFunctor { if (std::is_same::value) { Apply(static_cast(pool.Get(place))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) Apply(static_cast(pool.Get(place))); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 012edde57294a..1272a83b2b147 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -19,6 +19,9 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index f63baadbde526..2c4b4f1ceacf6 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -19,6 +19,11 @@ #include typedef hiprandState curandState; namespace cub = hipcub; + +#elif defined(PADDLE_WITH_MUSA) +#include +#include +#include #else #include #include @@ -72,6 +77,11 @@ __global__ void RandomSampleClassCenter(const int64_t n, CUDA_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } +#elif defined(PADDLE_WITH_MUSA) + murand_init(local_seed, id, increment, &localState); + CUDA_KERNEL_LOOP(i, n) { + buffer[i] = static_cast(murand(&localState) % max_val); + } #else curand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index e100397924af5..9e562cbf58dfe 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -39,7 +39,9 @@ template class CSyncCalcStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) auto place = ctx.GetPlace(); auto dev_ctx = static_cast( diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index bacbe014a343c..e5c918f0be9d0 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -60,6 +60,9 @@ class CWaitCommOp : public framework::OperatorBase { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(compute_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 34569b0a4b600..5276b1b15bcf8 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -61,6 +61,9 @@ class CWaitComputeOp : public framework::OperatorBase { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index 
0f04a295ed263..7db9932d99a98 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -77,7 +77,8 @@ class ConditionalOp : public framework::OperatorBase { ips[0]->numel())); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index c2deeb4190986..e0748c008a564 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -218,7 +218,8 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE( ALL_LAYOUT, paddle::operators::FeedSparseCooTensorKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE( feed_sparse_coo_tensor, GPU, diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 9f67b1d4b6e18..9bbe605c8ccb6 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -26,7 +26,8 @@ namespace imperative { class OpBase; } // namespace imperative } // namespace paddle -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -34,7 +35,8 @@ namespace paddle { namespace operators { static size_t CUDADevCount() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return platform::GetGPUDeviceCount(); #else return 0UL; diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 6ae32f33e957a..3d25edfe2e130 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -228,7 +228,8 @@ bool GetCondData(const phi::DenseTensor &cond) { // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index adb60a8a8d064..c20b691a4300c 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -19,6 +19,9 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index b2bbd9c82095c..eba1c5127b8a9 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -12,6 +12,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index 484bd8454bae9..b01813a0cfc27 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -120,7 +120,8 @@ class TargetAssignKernel : public framework::OpKernel { int64_t k = x->dims()[2]; auto x_lod = x->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::MixVector mixv_x_lod(&x_lod); size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace()); #else @@ -137,7 +138,8 @@ class TargetAssignKernel : public framework::OpKernel { k, out_data, out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) mixv_x_lod.CopyToCPU(); #endif @@ -154,7 +156,8 @@ class TargetAssignKernel : public framework::OpKernel { "TargetAssignOp input(NegIndices) needs 1 level of LoD")); const int* neg_idx_data = neg_indices->data(); auto neg_lod = neg_indices->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::MixVector mixv_neg_lod(&neg_lod); size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace()); #else @@ -170,7 +173,8 @@ class TargetAssignKernel : public framework::OpKernel { mismatch_value, out_data, out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) mixv_neg_lod.CopyToCPU(); #endif } diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 45f34313d1a3d..c0799c4c861c4 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -188,7 +188,8 @@ class DGCOpKernel : public framework::OpKernel { int buf_size = paddle::communication::dgc::get_buffer_size(k); paddle::memory::allocation::AllocationPtr tmp_ious_data; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(dev_ctx.GetPlace())) { tmp_ious_data = memory::Alloc( dev_ctx.GetPlace(), diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index c69acb89750c9..1feb5a5e1fc71 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -32,9 +32,11 @@ limitations under the License. 
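Inside .cu and .cu.h sources the dispatch is keyed on the compiler rather than the build flag: __NVCC__ for nvcc, __HIPCC__ for hipcc, and now __MUSACC__ for the MUSA compiler, each selecting its own CUB-compatible header. The sketch below shows the combined include block; several hunks above have __MUSACC__ reuse the bundled cub/cub.cuh, while other include targets are elided in the patch text, so those header names are assumptions.

// Illustrative include-dispatch block, not part of the patch.
#ifdef __NVCC__
#include "cub/cub.cuh"          // NVIDIA: bundled CUB, as in collect_fpn_proposals_op.cu above
#endif
#ifdef __MUSACC__
#include "cub/cub.cuh"          // MUSA: the hunks above reuse the same bundled CUB
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>    // assumed AMD header name (elided in the patch text)
namespace cub = hipcub;         // keeps existing cub:: call sites compiling unchanged
#endif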
*/ #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #ifdef __NVCC__ #include +#elif defined(__MUSACC__) +#include #elif defined(__HIPCC__) #include #endif @@ -311,7 +313,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, } } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template GetReduceDim(const framework::DDim &in, return phi::funcs::GetReduceDim(in, out, axis); } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template void GetGradXAndYOut(const phi::GPUContext &dev_ctx, diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 107fe9f6174b6..82195e874f1b6 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -155,7 +155,8 @@ REGISTER_OP_CPU_KERNEL(expand_as_grad, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL(expand_as, ops::ExpandAsKernel, ops::ExpandAsKernel, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index fee4b47049301..5cb29c1d48dad 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -283,7 +283,8 @@ REGISTER_OP_CPU_KERNEL(expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index b6dd3ca8f64b2..507cbc0d31d3a 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -192,6 +192,8 @@ struct FindChannelAbsMaxFunctor { #ifdef PADDLE_WITH_HIP hipMemset(out_abs_max, 0, sizeof(T) * cout); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_abs_max, 0, sizeof(T) * cout); #else cudaMemset(out_abs_max, 0, sizeof(T) * cout); #endif // PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_ diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 53001b2493084..db5c1ddcbb375 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -17,6 +17,9 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 35574331e17d7..65d4ae2d4c5ec 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -46,6 +46,8 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { int device_id; #ifdef PADDLE_WITH_HIP hipGetDevice(&device_id); +#elif defined(PADDLE_WITH_MUSA) + musaGetDevice(&device_id); #else cudaGetDevice(&device_id); #endif @@ -76,6 +78,17 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { sizeof(int64_t) * input_num, hipMemcpyHostToDevice, device_ctx.stream()); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyAsync(in_ids_d, + in1s.data(), + sizeof(int64_t) * input_num, + musaMemcpyHostToDevice, + device_ctx.stream()); + musaMemcpyAsync(in_embs_d, + in2s.data(), + sizeof(int64_t) * input_num, + musaMemcpyHostToDevice, + device_ctx.stream()); #else cudaMemcpyAsync(in_ids_d, in1s.data(), diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index dee676a7640f4..4eea6ab366fb6 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -15,6 +15,9 @@ limitations under the License. */ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 362860aa23bdf..89838e6084ab3 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -150,6 +150,34 @@ void FusedSeqpoolCVM(const framework::ExecutionContext lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); +#elif defined(PADDLE_WITH_MUSA) + T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); + platform::GpuMemcpyAsync(gpu_input_values, + input_data.data(), + input_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + T **gpu_output_values = + reinterpret_cast(&gpu_input_values[input_data.size()]); + platform::GpuMemcpyAsync(gpu_output_values, + output_data.data(), + output_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + T **gpu_seqpool_output_values = + reinterpret_cast(&gpu_output_values[output_data.size()]); + platform::GpuMemcpyAsync(gpu_seqpool_output_values, + seqpool_output_data.data(), + seqpool_output_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + size_t **lods_values = reinterpret_cast( + &gpu_seqpool_output_values[seqpool_output_data.size()]); + platform::GpuMemcpyAsync(lods_values, + lods.data(), + lods.size() * sizeof(size_t *), + musaMemcpyHostToDevice, + stream); #else T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_input_values, @@ -356,6 +384,37 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); +#elif defined(PADDLE_WITH_MUSA) + T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); + platform::GpuMemcpyAsync(gpu_out_grads_values, + out_grads_data.data(), + out_grads_data.size() * sizeof(T 
*), + musaMemcpyHostToDevice, + stream); + + T **gpu_in_grads_values = + reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]); + platform::GpuMemcpyAsync(gpu_in_grads_values, + in_grads_data.data(), + in_grads_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + T **gpu_cvm_values = + reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]); + platform::GpuMemcpyAsync(gpu_cvm_values, + cvm_data.data(), + cvm_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + size_t **lods_values = + reinterpret_cast(&gpu_cvm_values[cvm_data.size()]); + platform::GpuMemcpyAsync(lods_values, + lods.data(), + lods.size() * sizeof(size_t *), + musaMemcpyHostToDevice, + stream); #else T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_out_grads_values, diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index 8402bc78ef64c..5ee1ce015386f 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -329,6 +329,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); #ifdef PADDLE_WITH_HIP hipMemset(temp_qk_bias, 0, sizeof(float) * size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(temp_qk_bias, 0, sizeof(float) * size); #else cudaMemset(temp_qk_bias, 0, sizeof(float) * size); #endif diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu index 72bb97a2aae9e..72077400d5d2a 100644 --- a/paddle/fluid/operators/fused/yolo_box_post_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -255,6 +255,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + bbox_count_device_ptr, &bbox_count, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), cudaMemcpyHostToDevice); @@ -268,6 +271,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + &bbox_count, bbox_count_device_ptr, sizeof(int), musaMemcpyDeviceToHost); #else cudaMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), cudaMemcpyDeviceToHost); @@ -283,6 +289,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipFree(bbox_tensor); hipMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); +#elif defined(PADDLE_WITH_MUSA) + musaFree(bbox_tensor); + musaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); #else cudaFree(bbox_tensor); cudaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); @@ -296,6 +305,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + bbox_index_device_ptr, &bbox_index, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), cudaMemcpyHostToDevice); @@ -356,6 +368,13 @@ class YoloBoxPostKernel : public framework::OpKernel { anchors.data(), anchors.size() * sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&device_anchors), + anchors.size() * sizeof(int)); + musaMemcpy(device_anchors, + 
anchors.data(), + anchors.size() * sizeof(int), + musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&device_anchors), anchors.size() * sizeof(int)); @@ -388,6 +407,10 @@ class YoloBoxPostKernel : public framework::OpKernel { hipMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc( + reinterpret_cast(&ts_info[i].bboxes_dev_ptr), + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); #else cudaMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), @@ -398,6 +421,9 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), + sizeof(int)); #else cudaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); @@ -409,6 +435,8 @@ class YoloBoxPostKernel : public framework::OpKernel { int* bbox_index_device_ptr; #ifdef PADDLE_WITH_HIP hipMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #else cudaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #endif @@ -456,6 +484,12 @@ class YoloBoxPostKernel : public framework::OpKernel { ts_info[ts_id].bboxes_dev_ptr, ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyAsync( + ts_info[ts_id].bboxes_host_ptr, + ts_info[ts_id].bboxes_dev_ptr, + ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), + musaMemcpyDeviceToHost); #else cudaMemcpyAsync( ts_info[ts_id].bboxes_host_ptr, @@ -534,6 +568,8 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipFree(bbox_index_device_ptr); +#elif defined(PADDLE_WITH_MUSA) + musaFree(bbox_index_device_ptr); #else cudaFree(bbox_index_device_ptr); #endif @@ -541,6 +577,9 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipFree(ts_info[i].bboxes_dev_ptr); hipFree(ts_info[i].bbox_count_device_ptr); +#elif defined(PADDLE_WITH_MUSA) + musaFree(ts_info[i].bboxes_dev_ptr); + musaFree(ts_info[i].bbox_count_device_ptr); #else cudaFree(ts_info[i].bboxes_dev_ptr); cudaFree(ts_info[i].bbox_count_device_ptr); diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 32e7cffa4984b..4065fd1e017ea 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -31,6 +31,10 @@ limitations under the License. */ #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #ifdef PADDLE_WITH_HIP #include #include diff --git a/paddle/fluid/operators/fused_token_prune_op.cu b/paddle/fluid/operators/fused_token_prune_op.cu index 8f0a53611f3b2..4ff5fd33df3d6 100644 --- a/paddle/fluid/operators/fused_token_prune_op.cu +++ b/paddle/fluid/operators/fused_token_prune_op.cu @@ -14,6 +14,9 @@ limitations under the License. 
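yolo_box_post_op.cu above manages its scratch buffers with raw runtime calls, and the MUSA branch swaps each cuda*/hip* call for its musa* counterpart. Below is a compressed sketch of the grow-the-buffer and counter-reset idioms from those hunks; the header name is an assumption, the runtime calls come from the patch.

// Illustrative sketch, not part of the patch.
#include <musa_runtime.h>  // assumed header name

// Re-size the box buffer once the real box count is known, as YoloTensorParseCuda does.
void ResizeBBoxBuffer(float** bbox_tensor, int bbox_count, int class_num) {
  musaFree(*bbox_tensor);
  musaMalloc(reinterpret_cast<void**>(bbox_tensor),
             bbox_count * (5 + class_num) * sizeof(float));
}

// Reset a single device-side counter with a blocking host-to-device copy.
void ResetCounter(int* bbox_count_device_ptr) {
  int zero = 0;
  musaMemcpy(bbox_count_device_ptr, &zero, sizeof(int), musaMemcpyHostToDevice);
}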
*/ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index 8ae92b04b7df4..853540f7a2b9b 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -111,7 +111,8 @@ PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index e533960c8a648..0e96f7164e913 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -32,6 +32,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include #include +#elif defined(PADDLE_WITH_MUSA) +#include +#include #else #include #include @@ -95,6 +98,12 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); +#elif defined(PADDLE_WITH_MUSA) + murandState rng; + murand_init(rand_seed * gridDim.x + blockIdx.x, + threadIdx.y * WARP_SIZE + threadIdx.x, + 0, + &rng); #else curandState rng; curand_init(rand_seed * gridDim.x + blockIdx.x, @@ -128,6 +137,8 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, for (int idx = k + threadIdx.x; idx < deg; idx += WARP_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); +#elif defined(PADDLE_WITH_MUSA) + const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index dea3ce3fe695b..01d9642a49404 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -156,7 +156,8 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( hinge_loss_grad, CPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( hinge_loss, GPU, ALL_LAYOUT, ops::HingeLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 8c123bb8a32f2..56fa8cfc4b0cd 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -201,7 +201,8 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( im2sequence_grad, CPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( im2sequence, GPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index aab7953d6d103..b6a8b52c04083 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -67,7 +67,8 @@ bool TensorIsfinite(const phi::DenseTensor& tensor); FiniteVisitor(Isnan, Any, CPU); FiniteVisitor(Isinf, Any, CPU); 
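class_center_sample_op.cu and graph_khop_sampler_op.cu above replace the cuRAND device API with its MUSA analogue: one murandState per thread, seeded with murand_init and advanced with murand. Below is a device-side sketch of that usage; murandState, murand_init, and murand appear in the patch, but the device header that declares them is elided, so no include is shown.

// Illustrative device-side RNG loop, not part of the patch. Assumes it is
// compiled by the MUSA compiler with the murand device header available.
__global__ void FillRandomMod(int64_t* buffer, int64_t n, int64_t max_val,
                              unsigned long long seed, unsigned long long increment) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  murandState local_state;
  murand_init(seed, id, increment, &local_state);   // per-thread subsequence, as above
  for (int64_t i = id; i < n; i += gridDim.x * blockDim.x) {
    buffer[i] = static_cast<int64_t>(murand(&local_state) % max_val);
  }
}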
FiniteVisitor(Isfinite, All, CPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) FiniteVisitor(Isnan, Any, GPU); FiniteVisitor(Isinf, Any, GPU); FiniteVisitor(Isfinite, All, GPU); @@ -82,7 +83,8 @@ inline void TensorContainsNAN(const phi::DenseTensor& tensor, IsnanVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsnanVisitorGPU(tensor, out)); @@ -99,7 +101,8 @@ inline void TensorContainsInf(const phi::DenseTensor& tensor, IsinfVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsinfVisitorGPU(tensor, out)); @@ -116,7 +119,8 @@ inline void TensorIsfinite(const phi::DenseTensor& tensor, IsfiniteVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsfiniteVisitorGPU(tensor, out)); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 92f190c0025ed..c859183fd9661 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -96,7 +96,8 @@ PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index dd85ccff87f2d..fc3845703bef4 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -133,7 +133,8 @@ PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, CPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, GPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 94b0319729117..05a6a5c86831c 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -66,7 +66,8 @@ struct LoDTensorToArrayFunctor { if (std::is_same::value) { Apply(static_cast(dev_ctx)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) Apply(static_cast(dev_ctx)); #else PADDLE_THROW( diff --git 
a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 11c35293ebe34..b1282585bda6e 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -221,6 +221,9 @@ struct LookupTableV2GradCUDAFunctor { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index d741bc5b42549..4829c8f6c46c9 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -16,6 +16,8 @@ #ifdef PADDLE_WITH_HIP #include namespace cub = hipcub; +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index af14333b9d1ea..7b14e8541fd02 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -21,5 +21,6 @@ endif() math_library(unpooling) math_library(prelu) -math_library(bert_encoder_functor) +# TODO(@caizhi): enable it +#math_library(bert_encoder_functor) math_library(tree2col DEPS phi) diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 9a0b5a1ae3ab7..91fdcf82e83d0 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -20,6 +20,10 @@ limitations under the License. */ #include // NOLINT #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #ifdef PADDLE_WITH_HIP #include @@ -47,7 +51,8 @@ struct CUDATypeTraits { typedef float TYPE; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // This functor involves a fusion calculation in Ernie or Bert. 
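lookup_table_v2_op.cu above zeroes the gradient table with musaMemsetAsync on the device context stream, wrapped in PADDLE_ENFORCE_GPU_SUCCESS so a failing status becomes a Paddle error. A stripped-down sketch of the call shape without the Paddle plumbing (illustrative; header and stream type names are assumptions):

// Illustrative sketch, not part of the patch.
#include <musa_runtime.h>  // assumed header name
#include <cstdint>

// Asynchronously zero an N x D table of T on `stream`; in the hunk above the
// same call is wrapped in PADDLE_ENFORCE_GPU_SUCCESS.
template <typename T>
void ZeroTableAsync(T* d_table, int64_t n, int64_t d, musaStream_t stream) {
  musaMemsetAsync(d_table, 0, n * d * sizeof(T), stream);
}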
// The fusion mode is as follows: // diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 3032b78a2029d..0d8049023d2cd 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -17,6 +17,9 @@ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index 00ff1fbcbc38d..04e390499cb7f 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -23,7 +23,8 @@ namespace paddle { namespace operators { namespace math { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template class PreluChannelWiseDirectCUDAFunctor { public: diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 0c6b49729546c..5ab90409df1e7 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -160,6 +160,11 @@ void GPUSampleWithProb::operator()(const phi::GPUContext& context, s_data, sizeof(int64_t) * num_samples, hipMemcpyHostToDevice)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(samples_data + num_true, + s_data, + sizeof(int64_t) * num_samples, + musaMemcpyHostToDevice)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 7c60be6841552..d1487d9c57360 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -106,7 +106,8 @@ class SampleWithProb { } }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template class GPUSampleWithProb { public: diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index e1a36fa41894d..af41335bffa86 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -75,7 +75,7 @@ class MatMulKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) head_number = context.Attr("head_number"); #endif @@ -89,7 +89,7 @@ class MatMulKernel : public framework::OpKernel { } } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); if (head_number > 1) { @@ -241,7 +241,7 @@ class MatMulGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) if (context.HasAttr("head_number")) { head_number = context.Attr("head_number"); } @@ -373,7 +373,7 @@ class MatMulDoubleGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) head_number = context.Attr("head_number"); #endif @@ -615,7 +615,7 @@ class MatMulOp : public 
framework::OperatorWithKernel { } int64_t dim_out_y = mat_dim_y.width_; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) int head_number = context->Attrs().Get("head_number"); bool split_vertical_y = (mat_dim_x.width_ != mat_dim_y.height_); if (context->IsRuntime()) { @@ -758,7 +758,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra(); #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) AddAttr("head_number", "The number of heads of the matrix") .SetDefault(1); #endif @@ -926,7 +926,8 @@ REGISTER_OP_CPU_KERNEL(matmul_grad_grad, ops::MatMulDoubleGradKernel, ops::MatMulDoubleGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 5f480461d77cd..fff8b36d68405 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -39,7 +39,8 @@ class MemcpyH2DFunctor { void operator()(const phi::DenseTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto stream = static_cast(&dev_ctx_)->stream(); #else auto stream = nullptr; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 007f853f3243f..afa281f3679cc 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -68,7 +68,8 @@ class MergeLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 8c33a5da1baff..1de4eed001b92 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -157,6 +157,7 @@ REGISTER_OPERATOR(minus, ops::MinusGradMaker); PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {} #endif diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h index 01905d8ca84b3..70342339a55a1 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.h +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h @@ -25,6 +25,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" +#elif defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/platform/dynload/mccl.h" #else #include "paddle/fluid/platform/dynload/nccl.h" #endif diff --git a/paddle/fluid/operators/nop_op.cc b/paddle/fluid/operators/nop_op.cc index 69f0bfb2abcd3..cc3bbe8eac3ac 100644 --- a/paddle/fluid/operators/nop_op.cc +++ b/paddle/fluid/operators/nop_op.cc @@ -60,6 +60,7 @@ REGISTER_OP_WITHOUT_GRADIENT(nop, ops::NopOp, ops::NopOpMaker); PD_REGISTER_STRUCT_KERNEL(nop, CPU, ALL_LAYOUT, ops::NopKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(nop, GPU, ALL_LAYOUT, ops::NopKernel, float) {} #endif diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index cad7e38ba1c1a..411988f4f0560 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -32,6 +32,10 @@ #include "cub/cub.cuh" #include "math.h" // NOLINT #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#include "math.h" // NOLINT +#endif #ifdef __HIPCC__ #include @@ -53,6 +57,8 @@ static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { static_assert(!std::is_same::value, "T cannot be void."); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(x, 0, n * sizeof(T), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); #endif @@ -254,6 +260,10 @@ static bool IsFinite(const phi::GPUContext &dev_ctx, const float *ptr) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( &cpu_value, ptr, sizeof(float), hipMemcpyDeviceToHost, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync( + &cpu_value, ptr, sizeof(float), musaMemcpyDeviceToHost, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( &cpu_value, ptr, sizeof(float), cudaMemcpyDeviceToHost, stream)); @@ -1133,6 +1143,10 @@ static std::string GetMinMaxStr(const T *x, size_t n, const phi::Place &place) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( &ret_cpu[0], ret, 2 * sizeof(T), hipMemcpyDeviceToHost, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync( + &ret_cpu[0], ret, 2 * sizeof(T), musaMemcpyDeviceToHost, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( &ret_cpu[0], ret, 2 * sizeof(T), cudaMemcpyDeviceToHost, stream)); @@ -1189,6 +1203,12 @@ static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) { sizeof(flag), hipMemcpyDeviceToHost, dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(&flag, + out.Get(), + sizeof(flag), + musaMemcpyDeviceToHost, + dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&flag, out.Get(), diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index f1b162be46610..1f3ae2f9e318e 100644 --- 
a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -28,6 +28,9 @@ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; @@ -460,7 +463,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { grad_index.mutable_data({num_index}, ctx.GetPlace()); if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) auto sort_value_ptr = sort_value.mutable_data({num_index}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index d00cefab45045..500c375212bf9 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -260,7 +260,8 @@ PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(pad_constant_like, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index e2417a071ce88..a10f59f8a2fbe 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include "paddle/phi/backends/gpu/gpu_primitives.h" #endif @@ -85,7 +85,7 @@ inline HOSTDEVICE T PrRoIPoolingMatCalculation(const T* this_data, return sum_out; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE void PrRoIPoolingDistributeDiff(T* diff, const T top_diff, @@ -163,7 +163,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE void AccumulateRois(T* offset, T data) { phi::CudaAtomicAdd(offset, data); @@ -175,7 +175,7 @@ inline HOSTDEVICE void AccumulateRois(T* offset, T data) { } #endif -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE T MaxFunctor(const T x, const T y) { return max(x, y); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 99e8d04a9e329..eccc679666c58 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -107,7 +107,8 @@ PD_REGISTER_STRUCT_KERNEL(send_and_recv, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(send_and_recv, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index fc625826b9a91..91f76d2525de3 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -19,7 +19,8 @@ #include "paddle/fluid/framework/op_registry.h" #include 
"paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include #endif @@ -37,7 +38,8 @@ struct Random { using UniformIntDist = std::uniform_int_distribution; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template <> struct Random { using Engine = thrust::minstd_rand; diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index ebdddfd41b33f..712aac0e50716 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -246,7 +246,8 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( rank_loss_grad, CPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( rank_loss, GPU, ALL_LAYOUT, ops::RankLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index a0ad7e3939a02..99caa24a51078 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -48,7 +48,8 @@ BufferedReader::BufferedReader( buffer_size_(buffer_size), pin_memory_(pin_memory) { VLOG(1) << "BufferedReader"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place_) && !pin_memory) { int dev_idx = place_.device; compute_stream_ = @@ -118,7 +119,8 @@ void BufferedReader::ReadAsync(size_t i) { return -1UL; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // @{ Group GPU Place +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // @{ Group GPU Place if (platform::is_gpu_place(place_)) { TensorVec &cuda = cuda_buffer_[i]; if (cuda.empty()) { @@ -197,6 +199,11 @@ void BufferedReader::ReadAsync(size_t i) { hipEventRecord(events_[i].get(), compute_stream_)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream_.get(), events_[i].get(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventRecord(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_[i].get(), compute_stream_)); diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 032a74b7e23f1..ff902cc66445b 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -21,7 +21,8 @@ #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -80,7 +81,8 @@ class BufferedReader : public framework::DecoratedReader { std::vector xpu_buffer_; std::vector custom_device_buffer_; size_t prev_pos_{-1UL}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuStream_t compute_stream_; std::shared_ptr stream_; std::vector> events_; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 962b18c995979..b14eef3f29beb 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -429,7 +429,8 @@ class ReshapeKernel { pt_scalar_shape, out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeInferKernel(static_cast(dev_ctx), @@ -462,7 +463,8 @@ class ReshapeGradKernel { phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( @@ -492,7 +494,8 @@ class ReshapeDoubleGradKernel { phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( @@ -761,7 +764,8 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer, Reshape2DoubleGradInferShapeFunctor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index bc1f5a0d34f60..e786ea83fad73 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -117,7 +117,8 @@ PD_REGISTER_KERNEL(save_sr, phi::dtype::float16, phi::dtype::bfloat16) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(save, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/select_op_helper.h b/paddle/fluid/operators/select_op_helper.h index 2b7f884f6170c..4843492101b05 100644 --- a/paddle/fluid/operators/select_op_helper.h +++ b/paddle/fluid/operators/select_op_helper.h @@ -39,8 +39,9 @@ inline int GetBranchNumber(const phi::DenseTensor &mask) { } // when platform::is_gpu_place(mask.place()) is true std::unique_ptr cpu_mask{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUSTOM_DEVICE) || \ + defined(PADDLE_WITH_XPU) framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 2236988025cbc..4a715d0e35972 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -136,7 +136,8 @@ class SequenceReverseOpKernel : public 
framework::OpKernel { const size_t *lod; size_t lod_count = x.lod()[0].size(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto xlod = x.lod()[0]; phi::MixVector mixv_xlod(&xlod); @@ -144,7 +145,8 @@ class SequenceReverseOpKernel : public framework::OpKernel { } else { #endif lod = x.lod()[0].data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 0ca5514900d46..77f729e0f91ca 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -16,7 +16,8 @@ limitations under the License. */ #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 897ff207f5eca..7411ecc05358c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -18,6 +18,10 @@ limitations under the License. */ #include #endif +#ifdef __MUSACC__ +#include +#endif + #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu index 5069cf1e512cb..c4235a17f9918 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cu +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifndef _MSC_VER #include diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index e648575a1edca..27f947f434a07 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -69,7 +69,8 @@ class SplitLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index ccf5cd09a0842..e0004f197cd55 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -478,7 +478,7 @@ struct DeviceIndependenceTensorOperations { std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(phi::make_ddim(out_shape)); if (platform::is_gpu_place(context.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) // For GPU, there is no need to define XxxInverseFunctor and call // ElementwiseComputeEx in two branches. 
ElementwiseComputeEx, DeviceContext, InT>( diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu index 84e30250f85fd..1b24ea8276e24 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -320,6 +320,24 @@ PD_REGISTER_KERNEL(sync_batch_norm, kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } } +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormKernel, + float, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(sync_batch_norm, @@ -376,6 +394,18 @@ PD_REGISTER_KERNEL(sync_batch_norm_grad, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } } +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormGradKernel, + float, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad + } +} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(sync_batch_norm_grad, @@ -404,6 +434,12 @@ PD_REGISTER_KERNEL(sync_batch_norm_coo, phi::sparse::SyncBatchNormCooKernel, float, phi::dtype::float16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm_coo, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooKernel, + float, #else PD_REGISTER_KERNEL(sync_batch_norm_coo, GPU, @@ -421,6 +457,12 @@ PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, phi::sparse::SyncBatchNormCooGradKernel, float, phi::dtype::float16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooGradKernel, + float, #else PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, GPU, diff --git a/paddle/fluid/operators/sync_batch_norm_utils.h b/paddle/fluid/operators/sync_batch_norm_utils.h index 7c14f6dfac324..ebc825b66a5ef 100644 --- a/paddle/fluid/operators/sync_batch_norm_utils.h +++ b/paddle/fluid/operators/sync_batch_norm_utils.h @@ -22,6 +22,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index f1674bc5005a0..fede7fe5156d0 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -18,6 +18,9 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include #endif diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 16bce515f2a7f..12725c397faf6 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include "paddle/phi/core/generator.h" @@ -113,7 +113,7 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template struct UniformGenerator { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 4d7f496aaa42d..8b7c77d720fed 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -64,7 +64,9 @@ if(WITH_DGC) set(dgc_deps dgc) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) endif() @@ -90,8 +92,14 @@ if(WITH_ROCM) SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) endif() +if(WITH_MUSA) + musa_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS + simple_threadpool enforce) +endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) set(STREAM_CALLBACK_DEPS stream_callback_manager) else() set(STREAM_CALLBACK_DEPS) @@ -137,7 +145,9 @@ cc_library( SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) target_link_libraries(device_context gpu_resource_pool) endif() @@ -235,6 +245,13 @@ if(WITH_ROCM) DEPS device_context gpu_info) endif() +if(WITH_MUSA) + musa_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") +endif() + cc_library(timer SRCS timer.cc) cc_test( timer_test @@ -281,6 +298,20 @@ elseif(WITH_ROCM) stats op_proto_maker shape_inference) +elseif(WITH_MUSA) + musa_library( + profiler + SRCS + profiler.cc + profiler.cu + DEPS + phi + gpu_info + enforce + new_profiler + stats + op_proto_maker + shape_inference) elseif(WITH_XPU) cc_library( profiler @@ -339,6 +370,10 @@ if(WITH_GPU) DEPS gpu_info) endif() +if(WITH_MUSA) + musa_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) +endif() + if(WITH_ROCM) hip_test( float16_gpu_test diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index b133a57d523ac..a6c2b9d61dd2b 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -174,6 +174,8 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( for (int i = 0; i < kDevices; i++) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaSetDevice(i)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(i)); #endif diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu index b814bcde6841f..b78e4332d4bb0 100644 --- a/paddle/fluid/platform/complex_test.cu +++ b/paddle/fluid/platform/complex_test.cu @@ -27,7 +27,8 @@ #include 
"paddle/fluid/platform/enforce.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 6f0d86f0a4b17..10f7143028225 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,7 +1,9 @@ set(DEV_LIBS custom_device) # GPU -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) add_subdirectory(gpu) endif() diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index aa2dba03c9082..4a984cb34aae8 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -16,7 +16,8 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 897f8d3732b73..85a86ae8ecedd 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -22,6 +22,17 @@ elseif(WITH_ROCM) cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +elseif(WITH_MUSA) + musa_library( + gpu_info + SRCS + gpu_info.cc + DEPS + phi + glog + enforce + monitor + dynload_cuda) endif() cc_library( diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h index 878a122a49224..e6cac0e084ee5 100644 --- a/paddle/fluid/platform/device/gpu/gpu_helper.h +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -13,11 +13,12 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" -#else +#elif defined(PADDLE_WITH_CUDA) #include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h" #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 7f1f2c76bd630..ea85562ababb6 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -35,6 +35,8 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" +#elif defined(PADDLE_WITH_MUSA) +// TODO(Xiaokang Shang) #else #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" @@ -216,6 +218,12 @@ class RecordedGpuMallocHelper { } else { result = hipMalloc(ptr, size); } +#elif defined(PADDLE_WITH_MUSA) + if (UNLIKELY(malloc_managed_memory)) { + result = musaMallocManaged(ptr, size); + } else { + result = musaMalloc(ptr, size); + } #else phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard; if (UNLIKELY(malloc_managed_memory)) { @@ -262,6 +270,9 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_HIP auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { +#elif defined(PADDLE_WITH_MUSA) + auto err = musaFree(ptr); + if (err != musaErrorInvalidValue) { #else auto err = cudaFree(ptr); VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) @@ -309,6 +320,8 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); #ifdef PADDLE_WITH_HIP auto result = hipMemGetInfo(actual_avail, actual_total); +#elif defined(PADDLE_WITH_MUSA) + auto result = musaMemGetInfo(actual_avail, actual_total); #else auto result = cudaMemGetInfo(actual_avail, actual_total); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index de68329bba66d..3d76f09da559b 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -11,7 +11,8 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index d253a92c986ce..adde00d2f1b7a 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -16,10 +16,13 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 9f2168e1cdb8b..7cf3659d596e9 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -30,6 +31,9 @@ CudaStreamResourcePool::CudaStreamResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamCreateWithFlags(&stream, musaStreamNonBlocking)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); @@ -41,6 +45,8 @@ CudaStreamResourcePool::CudaStreamResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -82,6 +88,9 @@ CudaEventResourcePool::CudaEventResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); @@ -93,6 +102,8 @@ CudaEventResourcePool::CudaEventResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h index 2ac13e692f783..298e795524b4a 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h @@ -14,13 +14,19 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index c9afafdef7166..43a08c3c3b911 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -15,7 +15,8 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include @@ -23,6 +24,10 @@ #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#elif defined(PADDLE_WITH_MUSA) +#include +#include "paddle/fluid/platform/dynload/mublas.h" +using mudnnHandle_t = class Handle*; #else #include @@ -34,19 +39,49 @@ namespace paddle { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; + +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; #else // CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, 
CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, + cudaMemcpyKind, + hipMemcpyKind, + musaMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, + cudaDeviceProp, + hipDeviceProp_t, + musaDeviceProp); + +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); +DECLARE_TYPE_FOR_GPU(blasHandle_t, + cublasHandle_t, + rocblas_handle, + mublasHandle_t); + +using CUDAGraphID = unsigned long long; // NOLINT + +#undef DECLARE_TYPE_FOR_GPU + +// TODO(Xiaokang Shang): confirm mudnn type +#ifndef PADDLE_WITH_MUSA +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; +#elif defined(PADDLE_WITH_CUDA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, @@ -80,32 +115,33 @@ DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); - -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); - // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. 
DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - -using CUDAGraphID = unsigned long long; // NOLINT - #undef DECLARE_TYPE_FOR_GPU +#endif #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ + constexpr auto GPU_CV = MUSA_CV; #else // CDUA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + hipErrorOutOfMemory, + musaErrorMemoryAllocation); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, + cudaErrorNotReady, + hipErrorNotReady, + musaErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); #undef DECLARE_CONSTANT_FOR_GPU } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 6afcd2eb7cd97..be988cabdb5d1 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -32,6 +32,9 @@ #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" #endif +#ifdef PADDLE_WITH_MCCL +#include "paddle/fluid/platform/dynload/mccl.h" +#endif #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index 6b58453f03ea8..3070de23ca219 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -45,7 +45,8 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { )"; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(DeviceCode, cuda) { if (!phi::dynload::HasNVRTC() || !phi::dynload::HasCUDADriver()) { return; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 456abd55ef68f..fac5995371c8d 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -27,7 +27,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/expect.h" #include "paddle/phi/core/generator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -53,7 +54,8 @@ DeviceType Place2DeviceType(const platform::Place& place) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template typename std::enable_if::value, DevCtx*>::type @@ -86,7 +88,8 @@ inline std::unique_ptr CreateDeviceContext( DevCtx* dev_ctx = ConstructDevCtx(p, stream_priority); auto& instance = paddle::memory::allocation::AllocatorFacade::Instance(); if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto* cuda_ctx = dynamic_cast(dev_ctx); PADDLE_ENFORCE_NOT_NULL( cuda_ctx, @@ -172,7 +175,8 @@ void EmplaceDeviceContexts( /*unused*/ stream_priority); #endif } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) EmplaceDeviceContext( place_to_device_context, place, @@ -209,7 +213,8 @@ void EmplaceDeviceContexts( "option.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) EmplaceDeviceContext( place_to_device_context, place, diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b07b3f29dafde..0e54bab9a6871 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -42,6 +42,17 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" +#include "paddle/fluid/platform/dynload/mublas.h" +#include "paddle/fluid/platform/dynload/musparse.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/platform/dynload/mccl.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/dynload/miopen.h" @@ -136,7 +147,8 @@ namespace xpu = baidu::xpu::api; using XPUDeviceContext = phi::XPUContext; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) using CUDAPinnedDeviceContext = phi::GPUPinnedContext; #endif @@ -165,7 +177,8 @@ struct DefaultDeviceContextType { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template <> struct DefaultDeviceContextType { using TYPE = paddle::platform::CUDAPinnedDeviceContext; diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 402974b89e5c9..2287ffada5872 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -31,7 +31,8 @@ using ::paddle::platform::kXPU; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) USE_EVENT(kCUDA); USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index 37da8daf7fd69..f0bbb411abb89 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -15,7 +15,8 @@ #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { struct CUDADeviceEventWrapper { diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 4cb3bfdb3adae..95a488e3b9dba 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -23,6 +23,10 @@ if(WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc) endif() +if(WITH_MUSA) + list(APPEND MUSA_SRCS mublas.cc murand.cc musparse.cc) +endif() + # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows. 
if(NOT APPLE) @@ -39,6 +43,12 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() + if(WITH_MUSA) + list(APPEND MUSA_SRCS musa_driver.cc musartc.cc) + if(WITH_MCCL) + list(APPEND MUSA_SRCS mccl.cc) + endif() + endif() endif() if(TENSORRT_FOUND) @@ -62,6 +72,12 @@ if(WITH_ROCM) dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi) +elseif(WITH_MUSA) + musa_library(dynload_cuda SRCS ${MUSA_SRCS} DEPS dynamic_loader phi) + cc_library( + dynload_warpctc + SRCS warpctc.cc + DEPS dynamic_loader warpctc phi) else() nv_library( dynload_cuda diff --git a/paddle/fluid/platform/dynload/mccl.cc b/paddle/fluid/platform/dynload/mccl.cc new file mode 100644 index 0000000000000..ea5df00912dd4 --- /dev/null +++ b/paddle/fluid/platform/dynload/mccl.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mccl.h b/paddle/fluid/platform/dynload/mccl.h new file mode 100644 index 0000000000000..2f22f65d699d6 --- /dev/null +++ b/paddle/fluid/platform/dynload/mccl.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/mccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#define MCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(mcclCommInitAll); \ + __macro(mcclGetUniqueId); \ + __macro(mcclCommInitRank); \ + __macro(mcclCommDestroy); \ + __macro(mcclCommCount); \ + __macro(mcclCommCuDevice); \ + __macro(mcclCommUserRank); \ + __macro(mcclAllReduce); \ + __macro(mcclBcast); \ + __macro(mcclAllGather); \ + __macro(mcclGroupStart); \ + __macro(mcclGroupEnd); \ + __macro(mcclReduce); \ + __macro(mcclReduceScatter); \ + __macro(mcclGetErrorString); \ + __macro(mcclBroadcast); \ + __macro(mcclGetVersion); \ + __macro(mcclSend); \ + __macro(mcclRecv); \ + __macro(mcclRedOpCreatePreMulSum); \ + __macro(mcclRedOpDestroy); + +MCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mublas.cc b/paddle/fluid/platform/dynload/mublas.cc new file mode 100644 index 0000000000000..ae98e1a5c01bd --- /dev/null +++ b/paddle/fluid/platform/dynload/mublas.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mublas.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mublas.h b/paddle/fluid/platform/dynload/mublas.h new file mode 100644 index 0000000000000..d958d9ac7c9b6 --- /dev/null +++ b/paddle/fluid/platform/dynload/mublas.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include // NOLINT +#include + +#include "paddle/phi/backends/dynload/mublas.h" + +namespace paddle { +namespace platform { +namespace dynload { + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mublas routine + * via operator overloading. 
+ * + * note: default dynamic linked libs + */ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#define MUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(mublasSaxpy); \ + __macro(mublasDaxpy); \ + __macro(mublasCaxpy); \ + __macro(mublasZaxpy); \ + __macro(mublasSscal); \ + __macro(mublasDscal); \ + __macro(mublasScopy); \ + __macro(mublasDcopy); \ + __macro(mublasSgemv); \ + __macro(mublasDgemv); \ + __macro(mublasCgemv); \ + __macro(mublasZgemv); \ + __macro(mublasSgemm); \ + __macro(mublasDgemm); \ + __macro(mublasCgemm); \ + __macro(mublasZgemm); \ + __macro(mublasHgemm); \ + __macro(mublasSgeam); \ + __macro(mublasDgeam); \ + __macro(mublasDtrsm); \ + __macro(mublasCtrsm); \ + __macro(mublasZtrsm); \ + __macro(mublasCreate); \ + __macro(mublasDestroy); \ + __macro(mublasSetStream); \ + __macro(mublasSetPointerMode); \ + __macro(mublasGetPointerMode); \ + __macro(mublasSgemmBatched); \ + __macro(mublasDgemmBatched); \ + __macro(mublasCgemmBatched); \ + __macro(mublasZgemmBatched); + +MUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/murand.cc b/paddle/fluid/platform/dynload/murand.cc new file mode 100644 index 0000000000000..d1af076066117 --- /dev/null +++ b/paddle/fluid/platform/dynload/murand.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/murand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/murand.h b/paddle/fluid/platform/dynload/murand.h new file mode 100644 index 0000000000000..cf8ecf51595e0 --- /dev/null +++ b/paddle/fluid/platform/dynload/murand.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/murand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MURAND_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#define MURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(murandCreateGenerator); \ + __macro(murandSetStream); \ + __macro(murandSetPseudoRandomGeneratorSeed); \ + __macro(murandGenerateUniform); \ + __macro(murandGenerateUniformDouble); \ + __macro(murandGenerateNormal); \ + __macro(murandDestroyGenerator); + +MURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musa_driver.cc b/paddle/fluid/platform/dynload/musa_driver.cc new file mode 100644 index 0000000000000..8898bd4dfb654 --- /dev/null +++ b/paddle/fluid/platform/dynload/musa_driver.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/musa_driver.h" + +#include "paddle/phi/backends/dynload/musa_driver.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSA_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUDADriver() { return phi::dynload::HasCUDADriver(); } + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musa_driver.h b/paddle/fluid/platform/dynload/musa_driver.h new file mode 100644 index 0000000000000..261841e8e7384 --- /dev/null +++ b/paddle/fluid/platform/dynload/musa_driver.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/musa_driver.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern bool HasCUDADriver(); + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +/** + * include all needed musa driver functions + **/ +#define PLATFORM_MUSA_ROUTINE_EACH(__macro) \ + __macro(muInit); \ + __macro(muDriverGetVersion); \ + __macro(muGetErrorString); \ + __macro(muModuleLoadData); \ + __macro(muModuleGetFunction); \ + __macro(muModuleUnload); \ + __macro(muOccupancyMaxActiveBlocksPerMultiprocessor); \ + __macro(muLaunchKernel); \ + __macro(muCtxCreate); \ + __macro(muCtxGetCurrent); \ + __macro(muDeviceGetCount); \ + __macro(muDevicePrimaryCtxGetState); \ + __macro(muDeviceGetAttribute); \ + __macro(muDeviceGet) + +PLATFORM_MUSA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP); + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musartc.cc b/paddle/fluid/platform/dynload/musartc.cc new file mode 100644 index 0000000000000..4e15dab9c1359 --- /dev/null +++ b/paddle/fluid/platform/dynload/musartc.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/musartc.h" + +#include "paddle/phi/backends/dynload/musartc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSARTC_ROUTINE_EACH(DEFINE_WRAP); + +bool HasNVRTC() { return phi::dynload::HasNVRTC(); } + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musartc.h b/paddle/fluid/platform/dynload/musartc.h new file mode 100644 index 0000000000000..c383c85d7ab04 --- /dev/null +++ b/paddle/fluid/platform/dynload/musartc.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/musartc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern bool HasNVRTC(); + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +/** + * include all needed musartc functions + **/ +#define MUSARTC_ROUTINE_EACH(__macro) \ + __macro(mtrtcVersion); \ + __macro(mtrtcGetErrorString); \ + __macro(mtrtcCompileProgram); \ + __macro(mtrtcCreateProgram); \ + __macro(mtrtcDestroyProgram); \ + __macro(mtrtcGetMUSA); \ + __macro(mtrtcGetMUSASize); \ + __macro(mtrtcGetProgramLog); \ + __macro(mtrtcGetProgramLogSize) + +MUSARTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musparse.cc b/paddle/fluid/platform/dynload/musparse.cc new file mode 100644 index 0000000000000..b0e8dbb58d569 --- /dev/null +++ b/paddle/fluid/platform/dynload/musparse.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/musparse.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +#ifdef MUSPARSE_ROUTINE_EACH +MUSPARSE_ROUTINE_EACH(DEFINE_WRAP); +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musparse.h b/paddle/fluid/platform/dynload/musparse.h new file mode 100644 index 0000000000000..758c39104433e --- /dev/null +++ b/paddle/fluid/platform/dynload/musparse.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/musparse.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#if defined(PADDLE_WITH_MUSA) +#define MUSPARSE_ROUTINE_EACH(__macro) \ + __macro(musparseSetStream); \ + __macro(musparseCreateMatDescr); \ + __macro(musparseSnnz); \ + __macro(musparseDnnz); \ + __macro(musparseSetMatType); \ + __macro(musparseSetMatIndexBase); \ + __macro(musparseCreateCsr); \ + __macro(musparseCreateCoo); \ + __macro(musparseCreateDnMat); \ + __macro(musparseCreateDnVec); \ + __macro(musparseSpMM); \ + __macro(musparseDestroySpMat); \ + __macro(musparseDestroyDnMat); \ + __macro(musparseDestroyDnVec); \ + __macro(musparseSpMV); \ + __macro(musparseSDDMM_bufferSize); \ + __macro(musparseSDDMM_preprocess); \ + __macro(musparseSDDMM); \ + __macro(musparseDnMatSetStridedBatch); \ + __macro(musparseCooSetStridedBatch); \ + __macro(musparseCsrSetStridedBatch); + +MUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP) +#endif // PADDLE_WITH_MUSA + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 425d4939b565f..ff33ea379d20c 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -38,6 +38,16 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#include +#include +#include +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include @@ -98,7 +108,8 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/core/enforce.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/phi/core/flags.h" diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 9fc200ca82f1c..a5fc3786323c6 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -345,7 +345,8 @@ TEST(EOF_EXCEPTION, THROW_EOF) { EXPECT_TRUE(caught_eof); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { PADDLE_ENFORCE_GPU_SUCCESS(value); @@ -395,6 +396,54 @@ TEST(enforce, hip_success) { EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Rccl error")); #endif } +#elif defined(PADDLE_WITH_MUSA) +TEST(enforce, musa_success) { + EXPECT_TRUE(CheckCudaStatusSuccess(musaSuccess)); + EXPECT_TRUE(CheckCudaStatusFailure(musaErrorInvalidValue, "MUSA error")); + + EXPECT_TRUE(CheckCudaStatusFailure(musaErrorMemoryAllocation, "MUSA error")); + + EXPECT_TRUE(CheckCudaStatusFailure( + musaErrorInsufficientDriver, + "This indicates that the installed MooreThreads MUSA driver is older " + "than the " + "MUSA runtime library. 
This is not a supported configuration.Users " + "should install an updated MooreThreads display driver to allow the " + "application to run")); + EXPECT_TRUE(CheckCudaStatusFailure( + musaErrorContextIsDestroyed, + "This error indicates that the context current to the calling thread has " + "been destroyed using muCtxDestroy, or is a primary context which has " + "not yet been initialized")); + + EXPECT_TRUE(CheckCudaStatusSuccess(MURAND_STATUS_SUCCESS)); + EXPECT_TRUE( + CheckCudaStatusFailure(MURAND_STATUS_VERSION_MISMATCH, "MURAND error")); + EXPECT_TRUE( + CheckCudaStatusFailure(MURAND_STATUS_NOT_CREATED, "MURAND error")); + EXPECT_TRUE( + CheckCudaStatusFailure(MURAND_STATUS_LENGTH_NOT_MULTIPLE, + "Length requested is not a multple of dimension")); + + EXPECT_TRUE(CheckCudaStatusSuccess(MUBLAS_STATUS_SUCCESS)); + EXPECT_TRUE( + CheckCudaStatusFailure(MUBLAS_STATUS_NOT_IMPLEMENTED, "MUBLAS error")); + EXPECT_TRUE( + CheckCudaStatusFailure(MUBLAS_STATUS_INVALID_VALUE, "MUBLAS error")); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) + EXPECT_TRUE(CheckCudaStatusSuccess(mcclSuccess)); + EXPECT_TRUE(CheckCudaStatusFailure(mcclUnhandledMusaError, "MCCL error")); + EXPECT_TRUE(CheckCudaStatusFailure(mcclSystemError, "MCCL error")); + EXPECT_TRUE(CheckCudaStatusFailure(mcclInternalError, + "An internal check failed. This is either " + "a bug in MCCL or due to memory " + "corruption")); + EXPECT_TRUE(CheckCudaStatusFailure(mcclInvalidUsage, + "The call to MCCL is incorrect. This is " + "usually reflecting a programming error")); +#endif +} #else TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index e807a54fdee2d..e1a40cb8f7f64 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -21,6 +21,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index b5f31fd85847c..bce0890daecf9 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -18,7 +18,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -172,7 +173,8 @@ void InitDevices() { #endif /*Init all available devices by default */ std::vector devices; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) try { // use user specified GPUs in single-node multi-process mode. 
devices = platform::GetSelectedDevices(); @@ -215,7 +217,8 @@ void InitDevices(const std::vector devices) { continue; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPlace(devices[i])); #endif #ifdef PADDLE_WITH_XPU @@ -226,7 +229,8 @@ void InitDevices(const std::vector devices) { #endif } places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPinnedPlace()); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -441,14 +445,15 @@ void InitMemoryMethod() { memory_method->allocation_deleter = paddle::memory::allocation::Allocator::AllocationDeleter; #if defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_CUDA) || \ - defined(PADDLE_WITH_HIP) + defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory_method->copy_with_stream = paddle::memory::Copy; #endif memory_method->copy = paddle::memory::Copy; memory_method->device_memory_stat_current_value = paddle::memory::DeviceMemoryStatCurrentValue; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage; #endif memory_method->emplace_device_contexts = diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 66fb431af29e9..3cb6ea34bdaff 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -32,7 +32,8 @@ TEST(InitDevices, CUDA) { using paddle::framework::InitDevices; using paddle::platform::DeviceContextPool; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int count = paddle::platform::GetGPUDeviceCount(); InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 959379260419d..d2c1a448b633c 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -57,7 +57,8 @@ typename Visitor::result_type VisitPlace(const Place &place, const Visitor &visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::CUDAPlace p(place.GetDeviceId()); return visitor(p); #else @@ -67,7 +68,8 @@ typename Visitor::result_type VisitPlace(const Place &place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::CUDAPinnedPlace p; return visitor(p); #else diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 2c65023988dc6..979219fb1920b 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -698,7 +698,8 @@ void EnableProfiler(ProfilerState state) { HostTraceLevel::GetInstance().SetLevel(option.trace_level); should_send_profile_state = true; phi::GetDeviceTracer()->Enable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if 
(phi::ProfilerHelper::g_state == ProfilerState::kCUDA || phi::ProfilerHelper::g_state == ProfilerState::kAll || phi::ProfilerHelper::g_state == ProfilerState::kCPU) { diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index 5d1caffd45326..d2fea0336f012 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -16,6 +16,10 @@ limitations under the License. */ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif @@ -52,6 +56,20 @@ void DummyKernelAndEvent() { PADDLE_ENFORCE_GPU_SUCCESS(hipFree(ptr)); }); } +#elif defined(PADDLE_WITH_MUSA) + for (int i = 0; i < 5; i++) { + ForEachDevice([](int d) { + platform::SetDeviceId(d); + musaStream_t stream; + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream)); + Mark("_cuda_startup_"); + int *ptr; + PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&ptr, sizeof(int))); + DummyKernel<<<1, 1, 0, stream>>>(ptr); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(ptr)); + }); + } #else for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index c71b5a0e49104..607961ceebda3 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -31,7 +31,8 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -197,7 +198,8 @@ std::string OpName(const framework::VariableNameMap& name_map, const std::string& type_name); void SetTracerOption(TracerOption option); platform::TracerOption GetTracerOption(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void DummyKernelAndEvent(); #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index e3fe83c5a74d2..4bd2be19c15bd 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -561,7 +561,8 @@ void ChromeTracingLogger::LogMetaInfo(const std::string& version, span_indx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void ChromeTracingLogger::LogDeviceProperty( const std::map& device_property_map) { // add device property information diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 7f9bec1c32a53..6ad4883b89944 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -40,7 +40,8 @@ class ChromeTracingLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void LogDeviceProperty( const std::map& device_property_map); #endif 
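For reference, a minimal sketch of the four-way type-alias pattern that the gpu_types.h hunk above applies: under PADDLE_WITH_MUSA the gpu* aliases resolve to the musa* runtime types, so code written against gpuStream_t / gpuError_t compiles unchanged on Moore Threads hardware, and the surrounding hunks only need the extra `defined(PADDLE_WITH_MUSA)` arm in their guards. The vendor handle types below are stand-ins assumed here only to keep the snippet self-contained; the real header takes them from the CUDA, ROCm, or MUSA runtime headers.

// Illustrative stand-ins for the vendor stream handles; in the real build
// these come from the CUDA, HIP, or MUSA runtime headers respectively.
struct CUstream_st;
struct ihipStream_t;
struct MUstream_st;
using cudaStream_t = CUstream_st *;
using hipStream_t = ihipStream_t *;
using musaStream_t = MUstream_st *;

// Same selection logic as the patched DECLARE_TYPE_FOR_GPU macro:
// one alias per platform, chosen by the PADDLE_WITH_* build flag.
#if defined(PADDLE_WITH_HIP)
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \
  using GPU_TYPE = ROCM_TYPE;
#elif defined(PADDLE_WITH_MUSA)
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \
  using GPU_TYPE = MUSA_TYPE;
#else  // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \
  using GPU_TYPE = CUDA_TYPE;
#endif

// With -DPADDLE_WITH_MUSA this expands to `using gpuStream_t = musaStream_t;`,
// which is why call sites guarded by PADDLE_WITH_MUSA can keep using the
// generic gpuStream_t spelling rather than introducing new code paths.
DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t);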
diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index 1d0970235a128..f73423d84a69b 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -129,7 +129,8 @@ std::unique_ptr DeserializationReader::Parse() { // restore NodeTrees object std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); // restore gpuDeviceProp -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::map device_property_map; for (auto indx = 0; indx < node_trees_proto_->device_property_size(); indx++) { @@ -155,7 +156,8 @@ DeserializationReader::~DeserializationReader() { input_file_stream_.close(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuDeviceProp DeserializationReader::RestoreDeviceProperty( const DevicePropertyProto& device_property_proto) { gpuDeviceProp device_property; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index 5f99f6fd82c55..8f3f1766e126b 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -39,7 +39,8 @@ class DeserializationReader { MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( const OperatorSupplementEventNodeProto&); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&); #endif diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index be1e1c01f8b52..9fce9e3eeecf8 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -40,7 +40,8 @@ void SerializationLogger::OpenFile() { node_trees_proto_ = new NodeTreesProto(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void SerializationLogger::LogDeviceProperty( const std::map& device_property_map) { for (auto it = device_property_map.begin(); it != device_property_map.end(); diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 80d5413106ded..6ff84150436c7 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -37,7 +37,8 @@ class SerializationLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index eaea4f3850fef..14d81876233fd 100644 --- 
a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -137,7 +137,8 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { return host_python_node; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) ProfilerResult::ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -179,7 +180,8 @@ void ProfilerResult::Save(const std::string& file_name, if (format == std::string("json")) { ChromeTracingLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); @@ -187,7 +189,8 @@ void ProfilerResult::Save(const std::string& file_name, } else if (format == std::string("pb")) { SerializationLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index dae32a1902834..964fcc4c19050 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -138,7 +138,8 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) explicit ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -166,7 +167,8 @@ class ProfilerResult { std::string GetVersion() { return version_; } uint32_t GetSpanIndx() { return span_indx_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::map GetDeviceProperty() { return device_property_map_; } @@ -176,7 +178,8 @@ class ProfilerResult { std::map thread_event_trees_map_; std::shared_ptr tree_; ExtraInfo extra_info_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::map device_property_map_; #endif std::string version_; diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index e0a91629a10d6..76a1b347a363f 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -18,10 +18,14 @@ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/enforce.h" @@ -43,6 +47,9 @@ void SynchronizeDevice() { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); +#endif #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -161,7 +168,8 @@ std::unique_ptr 
Profiler::Stop() { std::string("%s"), kv.second.c_str()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::map device_property_map; std::vector device_ids = GetSelectedDevices(); for (auto index = 0u; index < device_ids.size(); index++) { diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index 7fb25b25577c4..a4fb29b86f43f 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -93,6 +93,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, return occupancy; } +#elif defined(PADDLE_WITH_MUSA) + #else float CalculateEstOccupancy(uint32_t DeviceId, diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index c9437e0e7793a..5adaadf87d288 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -133,6 +133,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, int32_t BlockZ, void* kernelFunc, uint8_t launchType); +#elif defined(PADDLE_WITH_MUSA) + #else float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 1d34d5fd27b3e..2e00826744091 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -31,6 +31,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#endif // PADDLE_WITH_MUSA #ifdef PADDLE_WITH_HIP #include #endif @@ -103,6 +106,15 @@ void SynchronizeAllDevice() { } SetDeviceId(pre_device_id); #endif +#ifdef PADDLE_WITH_MUSA + int pre_device_id = GetCurrentDeviceId(); + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); + } + SetDeviceId(pre_device_id); +#endif #ifdef PADDLE_WITH_HIP int pre_device_id = GetCurrentDeviceId(); int count = GetGPUDeviceCount(); @@ -142,7 +154,8 @@ void PrintMemProfiler( << " Memory Profiling Report " << "<-------------------------\n\n"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int num_gpus = GetGPUDeviceCount(); std::cout.setf(std::ios::left); if (num_gpus > 0) { @@ -344,7 +357,8 @@ void SetEvent(bool merge_thread, if (rit != pushed_events->rend()) { double event_time = 0; double gpu_time = 0.0f; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpu_time = rit->CudaElapsedMs(analyze_event); #endif double cpu_time = rit->CpuElapsedMs(analyze_event); diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 0e1c681288fe1..af59782d5c926 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -122,7 +122,8 @@ TEST(RecordEvent, RecordEvent) { if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; if (events[i][j].name() == "push") { EXPECT_EQ(events[i][j + 1].name(), "pop"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); #else EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); diff 
--git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index c55bcb71a7d43..69a1d2c575421 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -24,11 +24,14 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void *user_data) #endif +#ifdef PADDLE_WITH_MUSA + static void MUSART_CB StreamCallbackFunc(void *user_data) +#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data) + static void CUDART_CB StreamCallbackFunc(void *user_data) #else - static void CUDART_CB + static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data) #endif #endif @@ -58,6 +61,10 @@ void StreamCallbackManager::AddCallback( PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS( + musaLaunchHostFunc(stream_, StreamCallbackFunc, func)); +#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 PADDLE_ENFORCE_GPU_SUCCESS( @@ -71,7 +78,8 @@ void StreamCallbackManager::AddCallback( template void StreamCallbackManager::Wait() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) || \ + defined(PADDLE_WITH_MUSA) platform::GpuStreamSync(stream_); #endif { @@ -85,6 +93,9 @@ void StreamCallbackManager::Wait() const { #ifdef PADDLE_WITH_CUDA template struct StreamCallbackManager; #endif +#ifdef PADDLE_WITH_MUSA +template struct StreamCallbackManager; +#endif #ifdef PADDLE_WITH_HIP template struct StreamCallbackManager; #endif diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 7cd6930a9d0d0..10b0a1aded0d9 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -21,6 +21,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 2b8969e1b8181..b320f96839e4c 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -24,7 +24,8 @@ namespace py = pybind11; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::CUDAStream *get_current_stream(int device_id) { if (device_id == -1) { device_id = phi::backends::gpu::GetCurrentDeviceId(); @@ -51,7 +52,8 @@ void BindCudaStream(py::module *m_ptr) { m.def( "_get_current_stream", [](int deviceId) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return platform::get_current_stream(deviceId); #else PADDLE_THROW( @@ -64,7 +66,8 @@ void BindCudaStream(py::module *m_ptr) { m.def( "_set_current_stream", [](phi::CUDAStream *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return platform::set_current_stream(stream); #else PADDLE_THROW( @@ -75,7 +78,8 @@ void BindCudaStream(py::module *m_ptr) { py::return_value_policy::reference); m.def("_device_synchronize", [](int device_id) { -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (device_id == -1) { device_id = paddle::platform::GetCurrentDeviceId(); } @@ -84,6 +88,8 @@ void BindCudaStream(py::module *m_ptr) { paddle::platform::SetDeviceId(device_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif @@ -115,7 +121,8 @@ void BindCudaStream(py::module *m_ptr) { s3 = paddle.device.cuda.Stream() )DOC") -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def( "wait_event", [](phi::CUDAStream &self, paddle::platform::CudaEvent &event) { @@ -251,7 +258,8 @@ void BindCudaStream(py::module *m_ptr) { .def( "__init__", [](phi::CUDAStream &self, platform::CUDAPlace *place, int priority) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (priority != 1 && priority != 2) { PADDLE_THROW(platform::errors::InvalidArgument( "Priority should be 1(high) or 2(normal) ")); @@ -277,7 +285,8 @@ void BindCudaStream(py::module *m_ptr) { .def( "__init__", [](phi::CUDAStream &self, int device, int priority) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (priority != 1 && priority != 2) { PADDLE_THROW(platform::errors::InvalidArgument( "Priority should be 1(high) or 2(normal) ")); @@ -307,7 +316,8 @@ void BindCudaStream(py::module *m_ptr) { py::arg("device") = -1, py::arg("priority") = 2) .def("__init__", [](phi::CUDAStream &self) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int device_id = platform::GetCurrentDeviceId(); auto stream_flag = phi::CUDAStream::StreamFlag::kStreamNonBlocking; new (&self) phi::CUDAStream( @@ -334,7 +344,8 @@ void BindCudaStream(py::module *m_ptr) { event = paddle.device.cuda.Event() )DOC") -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def( "record", [](paddle::platform::CudaEvent &self, phi::CUDAStream *stream) { @@ -398,7 +409,8 @@ void BindCudaStream(py::module *m_ptr) { bool enable_timing, bool blocking, bool interprocess) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) unsigned int flags = platform::GenerateDeviceEventFlag( enable_timing, blocking, interprocess); new (&self) paddle::platform::CudaEvent(flags); diff --git a/paddle/fluid/pybind/cuda_streams_py.h b/paddle/fluid/pybind/cuda_streams_py.h index d10608a6e8ea9..41e62fd92aefb 100644 --- a/paddle/fluid/pybind/cuda_streams_py.h +++ b/paddle/fluid/pybind/cuda_streams_py.h @@ -17,7 +17,8 @@ #include "pybind11/pybind11.h" #include "pybind11/stl.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #else namespace phi { @@ -29,7 +30,8 @@ namespace py = pybind11; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::CUDAStream* get_current_stream(int device_id = -1); phi::CUDAStream* set_current_stream(phi::CUDAStream* stream); #endif diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 59a94a31c448d..d560e11da0674 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -58,7 +58,8 @@ typedef SSIZE_T ssize_t; #include "pybind11/numpy.h" #include "pybind11/pybind11.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/pybind/cuda_streams_py.h" #endif diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 69d0465bf7cdd..72a1df8e0ace9 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -138,7 +138,8 @@ std::set _complex_dtypes{ void SetDevice(paddle::platform::Place place) { if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(6) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << static_cast(place.device); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index eb0e895cf575c..a911d593f76c1 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -223,13 +223,16 @@ static PyObject* tensor_method_numpy(TensorObject* self, sizeof_dtype * numel); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (self->tensor.is_gpu()) { eager_gil_scoped_release guard; #if defined(PADDLE_WITH_CUDA) gpuMemcpyKind kind = cudaMemcpyDeviceToHost; #elif defined(PADDLE_WITH_HIP) gpuMemcpyKind kind = hipMemcpyDeviceToHost; +#elif defined(PADDLE_WITH_MUSA) + gpuMemcpyKind kind = musaMemcpyDeviceToHost; #endif if (self->tensor.is_selected_rows()) { VLOG(6) << "Getting SelectedRows's numpy value"; @@ -1338,7 +1341,8 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, self_numpy[_index] = py::object(py::handle(value_obj), true); } if (!self->tensor.initialized()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) SetTensorFromPyArray(self_tensor, self_numpy, platform::Place(platform::CUDAPlace(0)), diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 99621b1463ea9..598272ee09aff 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -40,7 +40,8 @@ void BindGenerator(py::module* m_ptr) { [](std::shared_ptr& self) { return self->current_seed; }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // NOTE(shenliang03): Due to the inability to serialize mt19937_64 // type, resulting in a problem with precision under the cpu. 
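// Sketch (illustration only, not part of the patch): the pybind hunks above
// follow the same convention as the rest of this change -- every
// `defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)` guard gains a
// PADDLE_WITH_MUSA branch, and each CUDA runtime call gets a musa* twin.
// A minimal sketch of that three-way dispatch, using only calls that already
// appear in this diff; the helper name is hypothetical.
inline void SynchronizeCurrentDevice() {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#elif defined(PADDLE_WITH_MUSA)
  PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize());
#elif defined(PADDLE_WITH_CUDA)
  PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#endif
}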
.def(py::pickle( diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index bdf54bd76b6e1..f27a7adc62a07 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -43,7 +43,8 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/compat/convert_utils.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -658,7 +659,8 @@ void BindPaddlePredictor(py::module *m) { .def("get_output_names", &PaddlePredictor::GetOutputNames) .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun) .def("clone", [](PaddlePredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("clone", [](PaddlePredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -705,7 +707,8 @@ void BindNativePredictor(py::module *m) { .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) .def("clone", [](NativePaddlePredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("clone", [](NativePaddlePredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -750,7 +753,8 @@ void BindAnalysisConfig(py::module *m) { .def("exp_enable_use_cutlass", &AnalysisConfig::Exp_EnableUseCutlass) .def("exp_disable_mixed_precision_ops", &AnalysisConfig::Exp_DisableMixedPrecisionOps) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("set_exec_stream", [](AnalysisConfig &self, phi::CUDAStream &stream) { self.SetExecStream(stream.raw_stream()); @@ -1084,7 +1088,8 @@ void BindAnalysisPredictor(py::module *m) { &AnalysisPredictor::analysis_argument, py::return_value_policy::reference) .def("clone", [](AnalysisPredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("clone", [](AnalysisPredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -1122,7 +1127,8 @@ void BindPaddleInferPredictor(py::module *m) { .def("run", [](paddle_infer::Predictor &self) { self.Run(); }) .def("clone", [](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("clone", [](paddle_infer::Predictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 9ba115381a2c0..aee4dd8b07a04 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -126,11 +126,12 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index c97bba9be8f2f..6c76c61542528 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -126,11 +126,12 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -318,7 +319,8 @@ void BindPlace(pybind11::module &m) { // NOLINT cudaplace .def("__init__", [](platform::CUDAPlace &self, int dev_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (UNLIKELY(dev_id < 0)) { LOG(ERROR) << string::Sprintf( "Invalid CUDAPlace(%d), device id must be 0 or " @@ -357,7 +359,8 @@ void BindPlace(pybind11::module &m) { // NOLINT std::exit(-1); #endif }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("get_device_id", [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) .def("_type", &PlaceIndex) @@ -372,10 +375,11 @@ void BindPlace(pybind11::module &m) { // NOLINT #endif .def("__repr__", string::to_string) .def("__str__", string::to_string); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return true; #else return platform::GetGPUComputeCapability(place.device) >= 53; @@ -383,7 +387,7 @@ void BindPlace(pybind11::module &m) { // NOLINT }); m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 80 support bfloat16 -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return false; #else return platform::GetGPUComputeCapability(place.device) >= 80; @@ -540,7 +544,8 @@ void BindPlace(pybind11::module &m) { // NOLINT cudapinnedplace .def("__init__", [](platform::CUDAPinnedPlace &self) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU 
only version, " "Please recompile or reinstall Paddle with CUDA support.")); diff --git a/paddle/fluid/pybind/process_group_utils.h b/paddle/fluid/pybind/process_group_utils.h index 1a6b640b3a3cf..85fde515754a5 100644 --- a/paddle/fluid/pybind/process_group_utils.h +++ b/paddle/fluid/pybind/process_group_utils.h @@ -250,7 +250,8 @@ void ConcatTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) ConcatDenseTensorWithType(static_cast(dev_ctx), tensor_list, dense_tensor, @@ -307,7 +308,8 @@ void SplitTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) SplitDenseTensorWithType(static_cast(dev_ctx), tensor, &dense_list, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 60ade1f9875fd..f63330c76a5fe 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -144,11 +144,12 @@ limitations under the License. */ #include "paddle/fluid/pybind/tensor.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -231,7 +232,8 @@ bool IsCompiledWithAVX() { } bool IsCompiledWithCUDA() { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) return false; #else return true; @@ -776,7 +778,8 @@ PYBIND11_MODULE(libpaddle, m) { } }); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) m.def("cudnn_version", &platform::DnnVersion); m.def("gpu_memory_available", []() { size_t available = 0; @@ -828,7 +831,8 @@ PYBIND11_MODULE(libpaddle, m) { if (dl.device.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (dl.device.device_type == kDLGPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } @@ -1563,7 +1567,8 @@ All parameter, weight, gradient are variables in Paddle. "create", [](paddle::platform::CUDAPlace &place) -> paddle::platform::DeviceContext * { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -1597,7 +1602,8 @@ All parameter, weight, gradient are variables in Paddle. 
"create", [](paddle::platform::CUDAPinnedPlace &place) -> paddle::platform::DeviceContext * { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -2199,7 +2205,8 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) m.def("get_cuda_device_count", platform::GetGPUDeviceCount); m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId); m.def("cuda_empty_cache", [] { @@ -2245,7 +2252,7 @@ All parameter, weight, gradient are variables in Paddle. return ostr.str(); }); -#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) +#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) && !defined(PADDLE_WITH_MUSA) m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); @@ -2320,7 +2327,8 @@ All parameter, weight, gradient are variables in Paddle. .def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo) .def("get_version", &paddle::platform::ProfilerResult::GetVersion) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx) .def("get_device_property", &paddle::platform::ProfilerResult::GetDeviceProperty); @@ -2477,7 +2485,8 @@ All parameter, weight, gradient are variables in Paddle. m.def("enable_op_info_recorder", &phi::EnableOpInfoRecorder); m.def("disable_op_info_recorder", &phi::DisableOpInfoRecorder); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) m.def("set_cublas_switch", phi::SetAllowTF32Cublas); m.def("get_cublas_switch", phi::AllowTF32Cublas); m.def("set_cudnn_switch", phi::SetAllowTF32Cudnn); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 98ae45dd0134b..cee763c6530f1 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -126,11 +126,12 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index ba33fcd1d129f..c0eaa9dc3d524 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -37,7 +37,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/pybind/complex.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" @@ -325,7 +326,8 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { #endif } else if (platform::is_gpu_place(self.place()) || platform::is_cuda_pinned_place(self.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) const T *a = self.data(); auto p = self.place(); paddle::memory::Copy( @@ -362,7 +364,8 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { #endif } else if (platform::is_gpu_place(self->place()) || platform::is_cuda_pinned_place(self->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy( @@ -457,7 +460,8 @@ void SetTensorFromPyArrayT( "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. @@ -466,6 +470,9 @@ void SetTensorFromPyArrayT( #ifdef PADDLE_WITH_HIP paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + paddle::platform::GpuMemcpySync( + dst, array.data(), array.nbytes(), musaMemcpyHostToDevice); #else paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), cudaMemcpyHostToDevice); @@ -790,7 +797,8 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self, output->mutable_data(place, self.dtype()); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_cuda_pinned_place(place)) { output->mutable_data(place, self.dtype()); } else if ((platform::is_gpu_place(place))) { @@ -1039,7 +1047,8 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (is_gpu_tensor) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); PADDLE_ENFORCE_EQ(py_arr.writeable(), true, diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 1ed3fac122826..593109d3e8e27 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -123,6 +123,9 @@ if(WITH_GPU) elseif(WITH_ROCM) hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) target_link_libraries(phi ${PHI_DEPS}) +elseif(WITH_MUSA) + musa_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) + target_link_libraries(phi ${PHI_DEPS}) elseif(WITH_XPU_KP) xpu_library( phi ${PHI_BUILD_TYPE} diff --git a/paddle/phi/api/include/context_pool.h 
b/paddle/phi/api/include/context_pool.h index 7afe17ba8419d..b694dc8013a30 100644 --- a/paddle/phi/api/include/context_pool.h +++ b/paddle/phi/api/include/context_pool.h @@ -97,7 +97,8 @@ namespace paddle { */ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * Get the current CUDA stream for the passed CUDA device. */ diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index b626df6c6701c..30f087d22c559 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -29,6 +29,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_MUSA +#include +using gpuStream_t = musaStream_t; +#endif + #include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" @@ -396,7 +401,8 @@ class PADDLE_API Tensor final { */ void set_impl(std::shared_ptr&& impl); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * @brief Get the stream where the tensor is currently located * This is a deprecated method and may be removed in the future! diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc index 292bd8a7e47aa..f958ea2a96039 100644 --- a/paddle/phi/api/lib/context_pool.cc +++ b/paddle/phi/api/lib/context_pool.cc @@ -19,7 +19,8 @@ limitations under the License. */ #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -63,7 +64,8 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) { return const_cast(&dev_ctx->GetAllocator()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) { PADDLE_ENFORCE_EQ(place.GetType(), phi::AllocationType::GPU, diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index f9316965be26b..12c13cba89fb0 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -93,7 +93,8 @@ phi::DenseTensor CastDataType(const Context& dev_ctx, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx, const phi::DenseTensor& tensor, DataType dtype) { @@ -135,7 +136,8 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, if (tensor.place().GetType() == phi::AllocationType::CPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (tensor.place().GetType() == phi::AllocationType::GPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); @@ -153,7 +155,8 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, << " dst_place: " 
<< dst_place; auto& pool = phi::DeviceContextPool::Instance(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // NOTE(yy): TransDataPlace should wait for computation of input. if (tensor.place().GetType() != phi::AllocationType::GPUPINNED) { pool.Get(tensor.place())->Wait(); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index e8caf52530868..a11dbf445ab9b 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -359,7 +359,8 @@ void Tensor::set_impl(std::shared_ptr &&impl) { impl_ = std::move(impl); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuStream_t Tensor::stream() const { int device_id = phi::backends::gpu::GetCurrentDeviceId(); auto *gpu_context = DeviceContextPool::Instance().Get( diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index b8d25e4f22b10..49e2a2698f4dd 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -17,9 +17,12 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/core/dense_tensor.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif @@ -30,7 +33,8 @@ namespace paddle { PD_REGISTER_API(from_blob) phi::Place GetPlaceFromPtr(void* data) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 cudaPointerAttributes attr; @@ -43,6 +47,12 @@ phi::Place GetPlaceFromPtr(void* data) { phi::errors::Unimplemented("The GetPlaceFromPtr() method is only " "supported when CUDA version >= 10.0.")); #endif +#elif defined(PADDLE_WITH_MUSA) + musaPointerAttributes attr; + musaError_t status = musaPointerGetAttributes(&attr, data); + if (status == musaSuccess && attr.type == musaMemoryTypeDevice) { + return phi::GPUPlace(attr.device); + } #else hipPointerAttribute_t attr; hipError_t status = hipPointerGetAttributes(&attr, data); diff --git a/paddle/phi/api/profiler/event.h b/paddle/phi/api/profiler/event.h index b19f20485227b..eaf8afbe03a65 100644 --- a/paddle/phi/api/profiler/event.h +++ b/paddle/phi/api/profiler/event.h @@ -27,8 +27,12 @@ limitations under the License. 
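// Sketch of the MUSA branch added to GetPlaceFromPtr() above, pulled out into
// a standalone function for clarity. The function name is hypothetical, and it
// assumes musaPointerGetAttributes classifies device memory the same way the
// CUDA/HIP attribute queries in the neighbouring branches do.
phi::Place PlaceOfPtr(void* data) {
  musaPointerAttributes attr;
  musaError_t status = musaPointerGetAttributes(&attr, data);
  if (status == musaSuccess && attr.type == musaMemoryTypeDevice) {
    return phi::GPUPlace(attr.device);  // pointer lives on a MUSA device
  }
  return phi::CPUPlace();  // otherwise treat it as host memory
}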
*/ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -62,7 +66,8 @@ class Event { void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } std::string attr() const { return attr_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifndef PADDLE_WITH_CUPTI gpuEvent_t event() const { return event_; } int device() const { return device_; } @@ -81,7 +86,8 @@ class Event { int64_t cpu_ns_; bool visited_status_{false}; std::string attr_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; @@ -137,12 +143,15 @@ class MemEvent { }; class CudaEvent { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) public: CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -152,6 +161,8 @@ class CudaEvent { explicit CudaEvent(unsigned int flags) : flags_(flags) { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -161,6 +172,8 @@ class CudaEvent { ~CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventDestroy(event_); +#elif defined(PADDLE_WITH_MUSA) + musaEventDestroy(event_); #else cudaEventDestroy(event_); #endif @@ -169,6 +182,8 @@ class CudaEvent { void Record(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream)); #endif @@ -183,6 +198,14 @@ class CudaEvent { if (err == hipErrorNotReady) { return false; } +#elif defined(PADDLE_WITH_MUSA) + gpuError_t err = musaEventQuery(event_); + if (err == musaSuccess) { + return true; + } + if (err == musaErrorNotReady) { + return false; + } #else gpuError_t err = cudaEventQuery(event_); if (err == cudaSuccess) { @@ -199,6 +222,8 @@ class CudaEvent { void Synchronize() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventSynchronize(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); #endif @@ -208,6 +233,8 @@ class CudaEvent { private: #ifdef PADDLE_WITH_HIP unsigned int flags_ = hipEventDefault; +#elif defined(PADDLE_WITH_MUSA) + unsigned int flags_ = musaEventDefault; #else unsigned int flags_ = cudaEventDefault; #endif diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 1c916682cf7b1..5e14c15e8cb26 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -7,7 +7,9 @@ if(NOT APPLE AND NOT WIN32) list(APPEND BACKENDS_SRCS device_code.cc) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) list(APPEND BACKENDS_SRCS 
gpu/gpu_context.cc gpu/gpu_info.cc gpu/gpu_resources.cc) if(WITH_GPU) @@ -16,6 +18,9 @@ if(WITH_GPU OR WITH_ROCM) if(WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) endif() + if(WITH_MUSA) + list(APPEND BACKENDS_SRCS gpu/musa/musa_info.cc) + endif() endif() if(WITH_XPU) @@ -43,6 +48,7 @@ list( if(WITH_GPU OR WITH_ROCM + OR WITH_MUSA OR WITH_CUSTOM_DEVICE) list(APPEND BACKENDS_SRCS device_base.cc) endif() diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc index e295ac388d892..e3b28fb2c0871 100644 --- a/paddle/phi/backends/context_pool.cc +++ b/paddle/phi/backends/context_pool.cc @@ -21,7 +21,8 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) bool allow_tf32_cublas = true; void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } bool AllowTF32Cublas() { return allow_tf32_cublas; } diff --git a/paddle/phi/backends/context_pool.h b/paddle/phi/backends/context_pool.h index 6ff90e05fed4a..966f338b7337c 100644 --- a/paddle/phi/backends/context_pool.h +++ b/paddle/phi/backends/context_pool.h @@ -27,7 +27,8 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void SetAllowTF32Cublas(bool active); /*Get the global variable allow_tf32_cublas value*/ bool AllowTF32Cublas(); @@ -46,7 +47,8 @@ struct DefaultDeviceContextType { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template <> struct DefaultDeviceContextType { using TYPE = phi::GPUContext; diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index eb2934d1b4842..ac16a69aa7bee 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -78,7 +78,8 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(phi::errors::PreconditionNotMet( @@ -88,12 +89,14 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) GPUDeviceCode::CheckAvailableStatus(); #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP static bool CheckCUDADriverResult(hipError_t result, std::string caller, @@ -101,6 +104,13 @@ static bool CheckCUDADriverResult(hipError_t result, if (result != hipSuccess) { const char* error = nullptr; error = dynload::hipGetErrorString(result); +#elif defined(PADDLE_WITH_MUSA) +static bool CheckCUDADriverResult(MUresult result, + std::string caller, + std::string kernel_name = "") { + if (result != MUSA_SUCCESS) { + const char* error = nullptr; + dynload::muGetErrorString(result, &error); #else static bool CheckCUDADriverResult(CUresult result, std::string caller, @@ -130,6 +140,8 @@ void 
GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hiprtcResult nvrtc_result = dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); +#elif defined(PADDLE_WITH_MUSA) + mtrtcResult nvrtc_result = dynload::mtrtcVersion(&nvrtc_major, &nvrtc_minor); #else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #endif @@ -140,6 +152,9 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); if (driver_result == hipSuccess) { +#elif defined(PADDLE_WITH_MUSA) + MUresult driver_result = dynload::muDriverGetVersion(&driver_version); + if (driver_result == MUSA_SUCCESS) { #else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); if (driver_result == CUDA_SUCCESS) { @@ -153,6 +168,8 @@ void GPUDeviceCode::CheckAvailableStatus() { << "." << nvrtc_minor; #ifdef PADDLE_WITH_HIP if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { +#elif defined(PADDLE_WITH_MUSA) + if (nvrtc_result != MTRTC_SUCCESS || driver_result != MUSA_SUCCESS) { #else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { #endif @@ -163,6 +180,9 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), "hipGetDeviceCount")) { +#elif defined(PADDLE_WITH_MUSA) + if (CheckCUDADriverResult(dynload::muDeviceGetCount(&count), + "muDeviceGetCount")) { #else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), "cuDeviceGetCount")) { @@ -202,6 +222,8 @@ static std::string FindCUDAIncludePath() { #ifdef PADDLE_WITH_HIP cuda_include_path = "/opt/rocm/include"; +#elif defined(PADDLE_WITH_MUSA) + cuda_include_path = "/usr/local/musa/include"; #else cuda_include_path = "/usr/local/cuda/include"; #endif @@ -229,6 +251,8 @@ GPUDeviceCode::GPUDeviceCode(const Place& place, name_ = name; #ifdef PADDLE_WITH_HIP kernel_ = "#include \n" + kernel; +#elif defined(PADDLE_WITH_MUSA) + kernel_ = kernel; #else kernel_ = kernel; #endif @@ -318,6 +342,86 @@ bool GPUDeviceCode::Compile(bool include_path) { "hipModuleGetFunction")) { return false; } +#elif defined(PADDLE_WITH_MUSA) + mtrtcProgram program; + if (!CheckNVRTCResult(dynload::mtrtcCreateProgram(&program, + kernel_.c_str(), // buffer + name_.c_str(), // name + 0, // numHeaders + nullptr, // headers + nullptr), // includeNames + "mtrtcCreateProgram")) { + return false; + } + + // Compile the program for specified compute_capability + auto* dev_ctx = reinterpret_cast( + DeviceContextPool::Instance().Get(place_)); + int compute_capability = dev_ctx->GetComputeCapability(); + std::string compute_flag = + "--gpu-architecture=compute_" + std::to_string(compute_capability); + std::vector options = {"--std=c++11", compute_flag.c_str()}; + std::string include_option; + if (include_path) { + std::string cuda_include_path = FindCUDAIncludePath(); + if (!cuda_include_path.empty()) { + include_option = "--include-path=" + cuda_include_path; + options.push_back(include_option.c_str()); + } + } + mtrtcResult compile_result = + dynload::mtrtcCompileProgram(program, // program + options.size(), // numOptions + options.data()); // options + if (compile_result == MTRTC_ERROR_COMPILATION) { + // Obtain compilation log from the program + size_t log_size; + if (!CheckNVRTCResult(dynload::mtrtcGetProgramLogSize(program, &log_size), + "mtrtcGetProgramLogSize")) { + return false; + } + std::vector log; + log.resize(log_size + 1); + if 
(!CheckNVRTCResult(dynload::mtrtcGetProgramLog(program, log.data()), + "nvrtcGetProgramLog")) { + return false; + } + LOG(WARNING) << "JIT compiling of MUSA code failed:" + << "\n Kernel name: " << name_ << "\n Kernel body:\n" + << kernel_ << "\n Compiling log: " << log.data(); + + return false; + } + + // Obtain PTX from the program + size_t ptx_size; + if (!CheckNVRTCResult(dynload::mtrtcGetMUSASize(program, &ptx_size), + "mtrtcGetMUSASize")) { + return false; + } + ptx_.resize(ptx_size + 1); + if (!CheckNVRTCResult(dynload::mtrtcGetMUSA(program, ptx_.data()), + "mtrtcGetMUSA")) { + return false; + } + + if (!CheckNVRTCResult(dynload::mtrtcDestroyProgram(&program), + "mtrtcDestroyProgram")) { + return false; + } + + if (!CheckCUDADriverResult(dynload::muModuleLoadData(&module_, ptx_.data()), + "muModuleLoadData", + name_)) { + return false; + } + + if (!CheckCUDADriverResult( + dynload::muModuleGetFunction(&function_, module_, name_.c_str()), + "muModuleGetFunction", + name_)) { + return false; + } #else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, @@ -436,6 +540,22 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { hipSuccess, errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", name_.c_str())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_EQ( + dynload::muLaunchKernel(function_, + num_blocks, + 1, + 1, // grid dim + num_threads_, + 1, + 1, // block dim + 0, // shared memory + dev_ctx->stream(), // stream + args->data(), // arguments + nullptr), + MUSA_SUCCESS, + errors::External("Fail to launch kernel %s (in muLaunchKernel.)", + name_.c_str())); #else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, @@ -464,6 +584,18 @@ bool GPUDeviceCode::CheckNVRTCResult(hiprtcResult result, << " > failed: " << dynload::hiprtcGetErrorString(result); return false; } + return true; +} +#elif defined(PADDLE_WITH_MUSA) +bool GPUDeviceCode::CheckNVRTCResult(mtrtcResult result, std::string function) { + if (result != MTRTC_SUCCESS) { + LOG_FIRST_N(WARNING, 1) + << "Call " << function << " for < " << name_ + << " > failed: " << dynload::mtrtcGetErrorString(result); + return false; + } + return true; +} #else bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { @@ -472,9 +604,9 @@ bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { << " > failed: " << dynload::nvrtcGetErrorString(result); return false; } -#endif return true; } #endif +#endif } // namespace phi diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 8debb4dc9c45e..62aea0c1c6ffb 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -26,6 +26,10 @@ limitations under the License. 
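// Rough usage sketch for the MUSA JIT path added above: mtrtcCreateProgram /
// mtrtcCompileProgram build device code, muModuleLoadData / muModuleGetFunction
// load it, and Launch() dispatches through muLaunchKernel on the context's
// stream. Assumes a WITH_MUSA build; the kernel name, source string and
// argument list are illustrative only.
std::string src = R"(
extern "C" __global__ void scale(float* x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
})";
phi::GPUDeviceCode code(place, "scale", src);  // place: a phi::GPUPlace
if (code.Compile(/*include_path=*/false)) {    // JIT-compile and load the module
  std::vector<void*> args = {&x_dev_ptr, &alpha, &n};
  code.Launch(static_cast<size_t>(n), &args);  // launched on dev_ctx->stream()
}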
*/ #include "paddle/phi/backends/dynload/cuda_driver.h" #include "paddle/phi/backends/dynload/nvrtc.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/musa_driver.h" +#include "paddle/phi/backends/dynload/musartc.h" +#endif #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hiprtc.h" #include "paddle/phi/backends/dynload/rocm_driver.h" @@ -48,7 +52,8 @@ class DeviceCode { std::string kernel_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class GPUDeviceCode : public DeviceCode { public: explicit GPUDeviceCode(const Place& place, @@ -68,6 +73,8 @@ class GPUDeviceCode : public DeviceCode { private: #ifdef PADDLE_WITH_HIP bool CheckNVRTCResult(hiprtcResult result, std::string function); +#elif defined(PADDLE_WITH_MUSA) + bool CheckNVRTCResult(mtrtcResult result, std::string function); #else bool CheckNVRTCResult(nvrtcResult result, std::string function); #endif @@ -82,6 +89,9 @@ class GPUDeviceCode : public DeviceCode { #ifdef PADDLE_WITH_HIP hipModule_t module_; hipFunction_t function_; +#elif defined(PADDLE_WITH_MUSA) + MUmodule module_; + MUfunction function_; #else CUmodule module_; CUfunction function_; diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index 8508d5206558d..1c47183f0f123 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -36,7 +36,8 @@ inline size_t Alignment(size_t size, if (place.GetType() == phi::AllocationType::CPU) { alignment = phi::backends::cpu::CpuMinChunkSize(); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) alignment = phi::backends::gpu::GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) alignment = phi::backends::xpu::XPUMinChunkSize(); diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 838b623ae7b38..b57c5d096fb2c 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -30,6 +30,17 @@ if(WITH_ROCM) rocsparse.cc) endif() +if(WITH_MUSA) + list( + APPEND + MUSA_SRCS + mudnn.cc + mublas.cc + musparse.cc + murand.cc + mccl.cc) +endif() + # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on macOS, and only do an early test on Linux and Windows. if(NOT APPLE) @@ -46,6 +57,9 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() + if(WITH_MUSA) + list(APPEND MUSA_SRCS musartc.cc musa_driver.cc) + endif() endif() if(TENSORRT_FOUND) @@ -93,6 +107,8 @@ if(WITH_ROCM) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${HIP_SRCS}) elseif(WITH_GPU) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${CUDA_SRCS}) +elseif(WITH_MUSA) + collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${MUSA_SRCS}) else() collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS}) endif() diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 354ff5b7dc855..ac06fb70e57cc 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -95,6 +95,27 @@ PHI_DEFINE_string(rccl_dir, "dlopen will search rccl from LD_LIBRARY_PATH"); #endif +#ifdef PADDLE_WITH_MUSA + +PHI_DEFINE_string(mudnn_dir, + "", + "Specify path for loading libmudnn.so. For instance, " + "/usr/local/musa/lib. 
If empty [default], dlopen " + "will search libmudnn.so from LD_LIBRARY_PATH"); + +PHI_DEFINE_string(musa_dir, + "", + "Specify path for loading musa library, such as libmublas, " + "libmurand, libmusparse. For instance, /usr/local/musa/lib. " + "If default, dlopen will search rocm from LD_LIBRARY_PATH"); + +PHI_DEFINE_string(mccl_dir, + "", + "Specify path for loading mccl library, such as libmccl.so. " + "For instance, /usr/local/musa/lib. If default, " + "dlopen will search mccl from LD_LIBRARY_PATH"); +#endif + #ifdef PADDLE_WITH_XPU DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); #endif @@ -319,6 +340,8 @@ void* GetCublasDsoHandle() { FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmublas.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -360,6 +383,8 @@ void* GetCUDNNDsoHandle() { FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_mudnn_dir, "libmudnn.so", false); #else return GetDsoHandleFromSearchPath( FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path}); @@ -384,6 +409,8 @@ void* GetCurandDsoHandle() { FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmurand.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif @@ -429,6 +456,8 @@ void* GetCusparseDsoHandle() { FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsparse.so"); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusparse.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so"); #endif @@ -439,6 +468,8 @@ void* GetNVRTCDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusart.so", false); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false); #endif @@ -449,6 +480,8 @@ void* GetCUDADsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusa.so", false); #elif defined(_WIN32) char system32_dir[MAX_PATH]; GetSystemDirectory(system32_dir, MAX_PATH); @@ -506,6 +539,10 @@ void* GetNCCLDsoHandle() { "You may need to install 'rccl' from ROCM official website: " "https://rocmdocs.amd.com/en/latest/Installation_Guide/" "Installation-Guide.html before install PaddlePaddle."); +#elif defined(PADDLE_WITH_MUSA) + std::string warning_msg( + "You may need to install 'mccl' from MUSA official website" + " before install PaddlePaddle."); #else std::string warning_msg( "You may need to install 'nccl2' from NVIDIA official website: " 
@@ -519,6 +556,9 @@ void* GetNCCLDsoHandle() { #elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) return GetDsoHandleFromSearchPath( FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); +#elif defined(PADDLE_WITH_MUSA) && defined(PADDLE_WITH_MCCL) + return GetDsoHandleFromSearchPath( + FLAGS_mccl_dir, "libmccl.so", true, {}, warning_msg); #else return GetDsoHandleFromSearchPath( FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); diff --git a/paddle/phi/backends/dynload/mccl.cc b/paddle/phi/backends/dynload/mccl.cc new file mode 100644 index 0000000000000..d6f0208780de8 --- /dev/null +++ b/paddle/phi/backends/dynload/mccl.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/mccl.h" + +namespace phi { +namespace dynload { + +std::once_flag mccl_dso_flag; +void *mccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mccl.h b/paddle/phi/backends/dynload/mccl.h new file mode 100644 index 0000000000000..19ab0246f99d7 --- /dev/null +++ b/paddle/phi/backends/dynload/mccl.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag mccl_dso_flag; +extern void* mccl_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_MCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using mccl_func = decltype(&::__name); \ + std::call_once(mccl_dso_flag, []() { \ + mccl_dso_handle = phi::dynload::GetNCCLDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(mccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define MCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(mcclCommInitAll); \ + __macro(mcclGetUniqueId); \ + __macro(mcclCommInitRank); \ + __macro(mcclCommDestroy); \ + __macro(mcclCommCount); \ + __macro(mcclCommCuDevice); \ + __macro(mcclCommUserRank); \ + __macro(mcclAllReduce); \ + __macro(mcclBcast); \ + __macro(mcclAllGather); \ + __macro(mcclGroupStart); \ + __macro(mcclGroupEnd); \ + __macro(mcclReduce); \ + __macro(mcclReduceScatter); \ + __macro(mcclGetErrorString); \ + __macro(mcclBroadcast); \ + __macro(mcclGetVersion); \ + __macro(mcclSend); \ + __macro(mcclRecv); \ + __macro(mcclRedOpCreatePreMulSum); \ + __macro(mcclRedOpDestroy); + +MCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mublas.cc b/paddle/phi/backends/dynload/mublas.cc new file mode 100644 index 0000000000000..72c0e9954311e --- /dev/null +++ b/paddle/phi/backends/dynload/mublas.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/mublas.h" + +namespace phi { +namespace dynload { +std::once_flag mublas_dso_flag; +void *mublas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mublas.h b/paddle/phi/backends/dynload/mublas.h new file mode 100644 index 0000000000000..3b91a703f5775 --- /dev/null +++ b/paddle/phi/backends/dynload/mublas.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include // NOLINT +#include + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag mublas_dso_flag; +extern void *mublas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mublas routine + * via operator overloading. 
+ * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using mublas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(mublas_dso_flag, []() { \ + mublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(mublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define MUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(mublasSaxpy); \ + __macro(mublasDaxpy); \ + __macro(mublasCaxpy); \ + __macro(mublasZaxpy); \ + __macro(mublasSscal); \ + __macro(mublasDscal); \ + __macro(mublasScopy); \ + __macro(mublasDcopy); \ + __macro(mublasSgemv); \ + __macro(mublasDgemv); \ + __macro(mublasCgemv); \ + __macro(mublasZgemv); \ + __macro(mublasSgemm); \ + __macro(mublasDgemm); \ + __macro(mublasCgemm); \ + __macro(mublasZgemm); \ + __macro(mublasHgemm); \ + __macro(mublasSgeam); \ + __macro(mublasDgeam); \ + __macro(mublasDtrsm); \ + __macro(mublasCtrsm); \ + __macro(mublasZtrsm); \ + __macro(mublasCreate); \ + __macro(mublasDestroy); \ + __macro(mublasSetStream); \ + __macro(mublasSetPointerMode); \ + __macro(mublasGetPointerMode); \ + __macro(mublasSgemmBatched); \ + __macro(mublasDgemmBatched); \ + __macro(mublasCgemmBatched); \ + __macro(mublasZgemmBatched); + +MUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + +#undef DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mudnn.cc b/paddle/phi/backends/dynload/mudnn.cc new file mode 100644 index 0000000000000..87b51cb8bb56a --- /dev/null +++ b/paddle/phi/backends/dynload/mudnn.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/mudnn.h" + +namespace phi { +namespace dynload { + +bool HasCUDNN() { + // note: mudnn.so is not imported by dlopen, which will be linked + // in cmakelist.txt. + return true; +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mudnn.h b/paddle/phi/backends/dynload/mudnn.h new file mode 100644 index 0000000000000..66ba6a21b28cf --- /dev/null +++ b/paddle/phi/backends/dynload/mudnn.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#ifdef PADDLE_WITH_MUSA + +namespace phi { +namespace dynload { + +extern bool HasCUDNN(); + +} // namespace dynload +} // namespace phi +#endif diff --git a/paddle/phi/backends/dynload/murand.cc b/paddle/phi/backends/dynload/murand.cc new file mode 100644 index 0000000000000..bd88319b0d524 --- /dev/null +++ b/paddle/phi/backends/dynload/murand.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/murand.h" + +namespace phi { +namespace dynload { + +std::once_flag murand_dso_flag; +void *murand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/murand.h b/paddle/phi/backends/dynload/murand.h new file mode 100644 index 0000000000000..64aa082b5a1b8 --- /dev/null +++ b/paddle/phi/backends/dynload/murand.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { +extern std::once_flag murand_dso_flag; +extern void *murand_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_MURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + murandStatus_t operator()(Args... args) { \ + using murandFunc = decltype(&::__name); \ + std::call_once(murand_dso_flag, []() { \ + murand_dso_handle = phi::dynload::GetCurandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(murand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define MURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(murandCreateGenerator); \ + __macro(murandSetStream); \ + __macro(murandSetPseudoRandomGeneratorSeed); \ + __macro(murandGenerateUniform); \ + __macro(murandGenerateUniformDouble); \ + __macro(murandGenerateNormal); \ + __macro(murandDestroyGenerator); + +MURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MURAND_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musa_driver.cc b/paddle/phi/backends/dynload/musa_driver.cc new file mode 100644 index 0000000000000..2173a8d6cdd81 --- /dev/null +++ b/paddle/phi/backends/dynload/musa_driver.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/musa_driver.h" + +namespace phi { +namespace dynload { + +std::once_flag musa_dso_flag; +void* musa_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSA_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUDADriver() { + std::call_once(musa_dso_flag, []() { musa_dso_handle = GetCUDADsoHandle(); }); + return musa_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musa_driver.h b/paddle/phi/backends/dynload/musa_driver.h new file mode 100644 index 0000000000000..3534ab8213c93 --- /dev/null +++ b/paddle/phi/backends/dynload/musa_driver.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag musa_dso_flag; +extern void* musa_dso_handle; +extern bool HasCUDADriver(); + +#define DECLARE_DYNAMIC_LOAD_MUSA_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using musa_func = decltype(&::__name); \ + std::call_once(musa_dso_flag, []() { \ + musa_dso_handle = phi::dynload::GetCUDADsoHandle(); \ + }); \ + static void* p_##__name = dlsym(musa_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed musa driver functions + **/ +#define MUSA_ROUTINE_EACH(__macro) \ + __macro(muInit); \ + __macro(muDriverGetVersion); \ + __macro(muGetErrorString); \ + __macro(muModuleLoadData); \ + __macro(muModuleGetFunction); \ + __macro(muModuleUnload); \ + __macro(muOccupancyMaxActiveBlocksPerMultiprocessor); \ + __macro(muLaunchKernel); \ + __macro(muCtxCreate); \ + __macro(muCtxGetCurrent); \ + __macro(muDeviceGetCount); \ + __macro(muDevicePrimaryCtxGetState); \ + __macro(muDeviceGetAttribute); \ + __macro(muDeviceGet); + +MUSA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUSA_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_MUSA_WRAP + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.cc b/paddle/phi/backends/dynload/musartc.cc new file mode 100644 index 0000000000000..9cd25270a1016 --- /dev/null +++ b/paddle/phi/backends/dynload/musartc.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/musartc.h" + +namespace phi { +namespace dynload { + +std::once_flag musartc_dso_flag; +void* musartc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSARTC_ROUTINE_EACH(DEFINE_WRAP); + +bool HasNVRTC() { + std::call_once(musartc_dso_flag, + []() { musartc_dso_handle = GetNVRTCDsoHandle(); }); + return musartc_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.h b/paddle/phi/backends/dynload/musartc.h new file mode 100644 index 0000000000000..317621090a5b3 --- /dev/null +++ b/paddle/phi/backends/dynload/musartc.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag musartc_dso_flag; +extern void* musartc_dso_handle; +extern bool HasNVRTC(); + +#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> DECLARE_TYPE(__name, args...) { \ + using musartc_func = decltype(&::__name); \ + std::call_once(musartc_dso_flag, []() { \ + musartc_dso_handle = phi::dynload::GetNVRTCDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(musartc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed musartc functions + **/ +#define MUSARTC_ROUTINE_EACH(__macro) \ + __macro(mtrtcVersion); \ + __macro(mtrtcGetErrorString); \ + __macro(mtrtcCompileProgram); \ + __macro(mtrtcCreateProgram); \ + __macro(mtrtcDestroyProgram); \ + __macro(mtrtcGetMUSA); \ + __macro(mtrtcGetMUSASize); \ + __macro(mtrtcGetProgramLog); \ + __macro(mtrtcGetProgramLogSize) + +MUSARTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musparse.cc b/paddle/phi/backends/dynload/musparse.cc new file mode 100644 index 0000000000000..35ccd602e63ba --- /dev/null +++ b/paddle/phi/backends/dynload/musparse.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/musparse.h" + +namespace phi { +namespace dynload { + +std::once_flag musparse_dso_flag; +void *musparse_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSPARSE_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musparse.h b/paddle/phi/backends/dynload/musparse.h new file mode 100644 index 0000000000000..595e6d490d5e4 --- /dev/null +++ b/paddle/phi/backends/dynload/musparse.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { +extern std::once_flag musparse_dso_flag; +extern void *musparse_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + musparseStatus_t operator()(Args... 
args) { \ + using Func = decltype(&::__name); \ + std::call_once(musparse_dso_flag, []() { \ + musparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(musparse_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#if defined(PADDLE_WITH_MUSA) +#define MUSPARSE_ROUTINE_EACH(__macro) \ + __macro(musparseSetStream); \ + __macro(musparseCreateMatDescr); \ + __macro(musparseSnnz); \ + __macro(musparseDnnz); \ + __macro(musparseSetMatType); \ + __macro(musparseSetMatIndexBase); \ + __macro(musparseCreateCsr); \ + __macro(musparseCreateCoo); \ + __macro(musparseCreateDnMat); \ + __macro(musparseCreateDnVec); \ + __macro(musparseSpMM); \ + __macro(musparseDestroySpMat); \ + __macro(musparseDestroyDnMat); \ + __macro(musparseDestroyDnVec); \ + __macro(musparseSpMV); \ + __macro(musparseSDDMM_bufferSize); \ + __macro(musparseSDDMM_preprocess); \ + __macro(musparseSDDMM); \ + __macro(musparseDnMatSetStridedBatch); \ + __macro(musparseCooSetStridedBatch); \ + __macro(musparseCsrSetStridedBatch); + +MUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP) + +#endif // PADDLE_WITH_MUSA + +#undef DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/gpu/forwards.h b/paddle/phi/backends/gpu/forwards.h index e1f3492f76870..4437f9c315ff0 100644 --- a/paddle/phi/backends/gpu/forwards.h +++ b/paddle/phi/backends/gpu/forwards.h @@ -72,6 +72,13 @@ using cufftHandle = int; // Forward declaration of NCCL types. using ncclComm_t = struct ncclComm *; +// Forward declaration of MUSA runtime types. +using musaStream_t = struct MUstream_st *; +using musaEvent_t = struct MUevent_st *; +using mublasHandle_t = struct _mublasHandle_t *; +using mudnnHandle_t = class Handle *; +using musparseHandle_t = struct _musparse_handle *; + /// Forward declaration of ROCM types. #include diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 5c9c010d365e4..615ab755f8a78 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -43,6 +43,15 @@ limitations under the License. 
*/ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/musparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/phi/backends/dynload/mccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" @@ -119,6 +128,9 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); @@ -143,11 +155,22 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void* user_data) #endif + +#ifdef PADDLE_WITH_MUSA +#if MUSA_VERSION >= 10000 + static void StreamCallbackFunc(void* user_data) +#else + static void StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, + void* user_data) +#endif +#endif + #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void* user_data) + static void CUDART_CB StreamCallbackFunc(void* user_data) #else - static void CUDART_CB + static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void* user_data) #endif #endif @@ -170,6 +193,8 @@ void DnnWorkspaceHandle::RunFuncSync( std::lock_guard guard(*mtx_); #ifdef PADDLE_WITH_HIP auto status = hipMalloc(&workspace_ptr, size); +#elif defined(PADDLE_WITH_MUSA) + auto status = musaMalloc(&workspace_ptr, size); #else auto status = cudaMalloc(&workspace_ptr, size); #endif @@ -178,6 +203,8 @@ void DnnWorkspaceHandle::RunFuncSync( phi::backends::gpu::GpuStreamSync(stream_); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(workspace_ptr)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); #endif @@ -248,7 +275,9 @@ struct GPUContext::Impl { DestoryInternalWorkspace(); DestoryInternalEigenDevice(); phi::DestroySparseHandle(sparse_handle_); +#ifndef PADDLE_WITH_MUSA phi::DestroySolverHandle(solver_handle_); +#endif phi::DestroyDnnHandle(dnn_handle_); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_comm_) { @@ -264,7 +293,9 @@ struct GPUContext::Impl { phi::DestroyBlasHandle(blas_handle_); phi::DestroyBlasHandle(blas_tensor_core_handle_); phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); +#ifndef PADDLE_WITH_MUSA phi::DestroyBlasLtHandle(blaslt_handle_); +#endif } if (stream_owned_ && stream_) { delete stream_; @@ -425,6 +456,7 @@ struct GPUContext::Impl { blas_tf32_tensor_core_handle_creator_ = std::move(handle_creator); } +#ifndef PADDLE_WITH_MUSA void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } void SetBlasLtHandle(std::function&& handle_creator) { @@ -443,6 +475,7 @@ struct GPUContext::Impl { PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); return blaslt_handle_; } +#endif dnnHandle_t GetDnnHandle() { std::call_once(flag_dnn_, [&]() { @@ -464,7 +497,7 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(dnn_handle_)); dnn_handle_ = 
nullptr; } -#else +#elif defined(PADDLE_WITH_CUDA) if (owned_ && dnn_handle_ != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(dnn_handle_)); dnn_handle_ = nullptr; @@ -478,6 +511,7 @@ struct GPUContext::Impl { dnn_handle_creator_ = std::move(handle_creator); } +#ifndef PADDLE_WITH_MUSA solverHandle_t GetSolverHandle() { std::call_once(flag_slover_, [&]() { if (!solver_handle_) { @@ -497,6 +531,7 @@ struct GPUContext::Impl { void SetSolverHandle(std::function&& handle_creator) { solver_handle_creator_ = std::move(handle_creator); } +#endif sparseHandle_t GetSparseHandle() { std::call_once(flag_sparse_, [&]() { @@ -529,7 +564,19 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) -#else // PADDLE_WITH_HIP + +#elif defined(PADDLE_WITH_MUSA) + musaError_t e_sync = musaSuccess; +#if !defined(_WIN32) + e_sync = musaStreamSynchronize(stream()); +#else + while (e_sync = musaStreamQuery(stream())) { + if (e_sync == musaErrorNotReady) continue; + break; + } +#endif // !defined(_WIN32) + +#else // PADDLE_WITH_MUSA cudaError_t e_sync = cudaSuccess; #if !defined(_WIN32) e_sync = cudaStreamSynchronize(stream()); @@ -539,7 +586,7 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) -#endif // PADDLE_WITH_HIP +#endif // PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(e_sync); } @@ -547,6 +594,8 @@ struct GPUContext::Impl { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream(), ev, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream(), ev, 0)); #endif @@ -678,6 +727,8 @@ struct GPUContext::Impl { void RecordEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(ev, stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream())); #endif @@ -708,11 +759,17 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); #endif +#endif + +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS( + musaLaunchHostFunc(stream(), internal::StreamCallbackFunc, func)); #endif } void WaitStreamCallback() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) || \ + defined(PADDLE_WITH_MUSA) phi::backends::gpu::GpuStreamSync(stream()); #endif { @@ -764,12 +821,16 @@ struct GPUContext::Impl { std::function blas_tensor_core_handle_creator_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; std::function blas_tf32_tensor_core_handle_creator_{nullptr}; +#ifndef PADDLE_WITH_MUSA blasLtHandle_t blaslt_handle_{nullptr}; std::function blaslt_handle_creator_{nullptr}; +#endif dnnHandle_t dnn_handle_{nullptr}; std::function dnn_handle_creator_{nullptr}; +#ifndef PADDLE_WITH_MUSA solverHandle_t solver_handle_{nullptr}; std::function solver_handle_creator_{nullptr}; +#endif sparseHandle_t sparse_handle_{nullptr}; std::function sparse_handle_creator_{nullptr}; DnnWorkspaceHandle* workspace_{nullptr}; @@ -839,6 +900,7 @@ blasHandle_t GPUContext::cublas_handle() const { return impl_->GetBlasHandle(); } +#ifndef PADDLE_WITH_MUSA blasLtHandle_t GPUContext::cublaslt_handle() const { return impl_->GetBlasLtHandle(); } @@ -846,6 +908,7 @@ blasLtHandle_t GPUContext::cublaslt_handle() const { solverHandle_t GPUContext::cusolver_dn_handle() 
const { return impl_->GetSolverHandle(); } +#endif sparseHandle_t GPUContext::cusparse_handle() const { return impl_->GetSparseHandle(); @@ -965,6 +1028,7 @@ void GPUContext::SetBlasTF32Handle(std::function&& func) { impl_->SetBlasTF32Handle(std::move(func)); } +#ifndef PADDLE_WITH_MUSA void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { impl_->SetBlasLtHandle(blaslt); } @@ -972,6 +1036,7 @@ void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { void GPUContext::SetBlasLtHandle(std::function&& func) { impl_->SetBlasLtHandle(std::move(func)); } +#endif void GPUContext::SetDnnHandle(dnnHandle_t handle) { impl_->SetDnnHandle(handle); @@ -981,6 +1046,7 @@ void GPUContext::SetDnnHandle(std::function&& func) { impl_->SetDnnHandle(std::move(func)); } +#ifndef PADDLE_WITH_MUSA void GPUContext::SetSolverHandle(solverHandle_t handle) { impl_->SetSolverHandle(handle); } @@ -988,6 +1054,7 @@ void GPUContext::SetSolverHandle(solverHandle_t handle) { void GPUContext::SetSolverHandle(std::function&& func) { impl_->SetSolverHandle(std::move(func)); } +#endif void GPUContext::SetSparseHandle(sparseHandle_t handle) { impl_->SetSparseHandle(handle); @@ -1046,7 +1113,8 @@ void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { void GPUContext::ClearDnnAttr() { return impl_->ClearDnnAttr(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) GPUPinnedContext::GPUPinnedContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index b4a3974378241..ce92612304cda 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -16,7 +16,7 @@ limitations under the License. */ #pragma once #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU_KP) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU_KP) #include #include @@ -108,11 +108,13 @@ class PADDLE_API GPUContext : public DeviceContext, /*! \brief Return cublas handle in the device context. */ blasHandle_t cublas_handle() const; +#ifndef PADDLE_WITH_MUSA /*! \brief Return cublasLt handle in the device context. */ blasLtHandle_t cublaslt_handle() const; /*! \brief Return cusolver handle in the device context. */ solverHandle_t cusolver_dn_handle() const; +#endif /*! \brief Return cusparse handle in the device context. */ sparseHandle_t cusparse_handle() const; @@ -232,14 +234,18 @@ class PADDLE_API GPUContext : public DeviceContext, void SetBlasTF32Handle(blasHandle_t); void SetBlasTF32Handle(std::function&&); +#ifndef PADDLE_WITH_MUSA void SetBlasLtHandle(blasLtHandle_t); void SetBlasLtHandle(std::function&&); +#endif void SetDnnHandle(dnnHandle_t); void SetDnnHandle(std::function&&); +#ifndef PADDLE_WITH_MUSA void SetSolverHandle(solverHandle_t); void SetSolverHandle(std::function&&); +#endif void SetSparseHandle(sparseHandle_t); void SetSparseHandle(std::function&&); @@ -276,7 +282,8 @@ using GPUDNNContext = GPUContext; // because we want to implement a KPS-based kernel and make it run // on GPU and XPU at the same time, so we need KPSContext when registering // KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) using KPSContext = GPUContext; #endif @@ -287,7 +294,8 @@ struct DefaultDevice; } // namespace Eigen namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Currently, GPUPinnedContext is only used to data copying. class GPUPinnedContext : public DeviceContext, diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index 4a6b9d2fd87f1..65ec52c5476d0 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -20,18 +20,39 @@ namespace phi { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#else // PADDLE_WITH_CDUA - -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; +#else +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; -#endif +#endif // PADDLE_WITH_CDUA -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); +DECLARE_TYPE_FOR_GPU(sparseHandle_t, + cusparseHandle_t, + rocsparse_handle, + musparseHandle_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); +DECLARE_TYPE_FOR_GPU(blasHandle_t, + cublasHandle_t, + rocblas_handle, + mublasHandle_t); +#undef DECLARE_TYPE_FOR_GPU + +#ifndef PADDLE_WITH_MUSA +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; +#else +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif // PADDLE_WITH_CDUA DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, miopenActivationDescriptor); @@ -56,19 +77,13 @@ DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); - -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); - -DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); - #undef DECLARE_TYPE_FOR_GPU +#endif using CUDAGraphID = unsigned long long; // NOLINT diff --git a/paddle/phi/backends/gpu/gpu_device_function.h b/paddle/phi/backends/gpu/gpu_device_function.h index 0f79e2a645ab3..a5728c25012f9 100644 --- a/paddle/phi/backends/gpu/gpu_device_function.h +++ b/paddle/phi/backends/gpu/gpu_device_function.h @@ -13,10 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_device_function.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_device_function.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_device_function.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index f37afa3deeb74..b67010344a64e 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -14,11 +14,14 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) + #else // CUDA #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_helper.h b/paddle/phi/backends/gpu/gpu_helper.h index 2353b42794ffd..456681bb2b5d6 100644 --- a/paddle/phi/backends/gpu/gpu_helper.h +++ b/paddle/phi/backends/gpu/gpu_helper.h @@ -13,10 +13,13 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_helper.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index ebf57bd06eb19..70b4ebd21294e 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -11,7 +11,8 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index a7a7ad03ad664..5080a714bebb3 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -16,10 +16,13 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index a77527c081650..d46ada073c47d 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -16,6 +16,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif @@ -58,7 +61,8 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_cast(val)); // NOLINT } -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) +#if defined(__HIPCC__) || defined(__MUSACC__) || \ + (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index a447df94cb4dc..2bee37b300258 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -33,6 +33,15 @@ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/musparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/phi/backends/dynload/mccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/rocsparse.h" #endif @@ -64,10 +73,9 @@ void InitGpuProperties(Place place, *driver_version = backends::gpu::GetGPUDriverVersion(place.GetDeviceId()); *runtime_version = backends::gpu::GetGPURuntimeVersion(place.GetDeviceId()); +#ifdef PADDLE_WITH_CUDA const gpuDeviceProp& prop = backends::gpu::GetDeviceProperties(place.GetDeviceId()); - -#ifdef PADDLE_WITH_CUDA static const std::set compiled_archs{CUDA_REAL_ARCHS}; // Make sure compiled cuda arch is as same as runtime cuda arch. if (compiled_archs.find(*compute_capability) == compiled_archs.cend() && @@ -144,6 +152,47 @@ << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; } +#elif defined(PADDLE_WITH_MUSA) + // TODO(@caizhi): mudnnGetVersion is not supported for MUSA now. + // Requests have been submitted to Mudnn. + // size_t mudnn_dso_ver = dynload::mudnnGetVersion(); + size_t mudnn_dso_ver = 1100; + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) + << ", muDNN Version: " << mudnn_dso_ver / 1000 << "." + << (mudnn_dso_ver % 1000) / 100 << "."; + + // Check MUSA/MUDNN version compatibility + auto local_musa_version = + (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; + auto compile_musa_version = + (MUSA_VERSION / 1000) * 10 + (MUSA_VERSION % 100) / 10; +#if defined(__linux__) + PADDLE_ENFORCE_EQ( + (local_musa_version / 10 < compile_musa_version / 10) && + (mudnn_dso_ver / 1000 < MUDNN_VERSION / 1000), + false, + phi::errors::InvalidArgument( + "The installed Paddle is compiled with MUSA%d/muDNN%d, " + "but MUSA/muDNN version in your machine is MUSA%d/muDNN%d, " + "which will cause serious incompatible bug. " + "Please recompile or reinstall Paddle with compatible MUSA/muDNN " + "version.", + compile_musa_version / 10, + MUDNN_VERSION / 1000, + local_musa_version / 10, + mudnn_dso_ver / 1000)); +#endif + if (local_musa_version < compile_musa_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place.device) + << ". The installed Paddle is compiled with MUSA " + << compile_musa_version / 10 << "." << compile_musa_version % 10 + << ", but MUSA runtime version in your machine is " + << local_musa_version / 10 << "." << local_musa_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible MUSA " + "version."; + } #else size_t cudnn_dso_ver = dynload::cudnnGetVersion(); LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) @@ -189,6 +238,9 @@ void InitStream(gpuStream_t* stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithPriority(stream, hipStreamDefault, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamCreateWithPriority(stream, musaStreamDefault, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithPriority(stream, cudaStreamDefault, 0)); @@ -199,6 +251,8 @@ void DestoryStream(gpuStream_t stream) { if (stream != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -210,7 +264,11 @@ void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { #ifdef PADDLE_WITH_HIP phi::dynload::rocblas_create_handle(blas_handle); phi::dynload::rocblas_set_stream(*blas_handle, stream); -#else // PADDLE_WITH_CUDA +#elif defined(PADDLE_WITH_MUSA) + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::mublasCreate(blas_handle)); + PADDLE_RETRY_CUDA_SUCCESS( + phi::dynload::mublasSetStream(*blas_handle, stream)); +#else // PADDLE_WITH_MUSA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); PADDLE_RETRY_CUDA_SUCCESS( phi::dynload::cublasSetStream(*blas_handle, stream)); @@ -223,6 +281,11 @@ void DestroyBlasHandle(blasHandle_t handle) { phi::dynload::rocblas_destroy_handle(handle); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + phi::dynload::mublasDestroy(handle); + handle = nullptr; + } #else if (handle != nullptr) { phi::dynload::cublasDestroy(handle); @@ -231,6 +294,7 @@ void DestroyBlasHandle(blasHandle_t handle) { #endif // PADDLE_WITH_HIP } +#ifndef PADDLE_WITH_MUSA void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 phi::dynload::cublasLtCreate(blaslt_handle); @@ -245,6 +309,7 @@ void DestroyBlasLtHandle(blasLtHandle_t handle) { } #endif } +#endif void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { if (phi::dynload::HasCUDNN()) { @@ -268,7 +333,7 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); -#else +#elif defined(PADDLE_WITH_CUDA) auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; if (local_cudnn_version < static_cast(compile_cudnn_version)) { @@ -296,6 +361,12 @@ void DestroyDnnHandle(dnnHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + // TODO(@caizhi): enable dynload module + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDestroy(handle)); + handle = nullptr; + } #else if (handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); @@ -304,21 +375,23 @@ void DestroyDnnHandle(dnnHandle_t handle) { #endif // PADDLE_WITH_HIP } +#ifndef PADDLE_WITH_MUSA void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); 
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); #endif } void DestroySolverHandle(solverHandle_t solver_handle) { -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA if (solver_handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); solver_handle = nullptr; } #endif } +#endif void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { // ROCM is not yet supported diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h index 7bec5eebf5886..16d63910b8f4a 100644 --- a/paddle/phi/backends/gpu/gpu_resources.h +++ b/paddle/phi/backends/gpu/gpu_resources.h @@ -35,14 +35,18 @@ void DestoryStream(gpuStream_t stream); void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream); void DestroyBlasHandle(blasHandle_t handle); +#ifndef PADDLE_WITH_MUSA void InitBlasLtHandle(blasLtHandle_t* blaslt_handle); void DestroyBlasLtHandle(blasLtHandle_t handle); +#endif void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place); void DestroyDnnHandle(dnnHandle_t handle); +#ifndef PADDLE_WITH_MUSA void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream); void DestroySolverHandle(solverHandle_t solver_handle); +#endif void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream); void DestroySparseHandle(sparseHandle_t handle); diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index 77f403795b6b3..00c0bdf6c545b 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -17,11 +17,15 @@ #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" #else // PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" @@ -30,18 +34,39 @@ namespace phi { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#else // PADDLE_WITH_CDUA +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; + +#else // PADDLE_WITH_MUSA +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif // PADDLE_WITH_CUDA + +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, + cudaMemcpyKind, + hipMemcpyKind, + musaMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, + cudaDeviceProp, + hipDeviceProp_t, + musaDeviceProp); +#undef DECLARE_TYPE_FOR_GPU + +#ifndef PADDLE_WITH_MUSA +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; +#else // PADDLE_WITH_MUSA #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; -#endif +#endif // PADDLE_WITH_CUDA -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, 
hipDeviceProp_t); DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, @@ -50,34 +75,45 @@ DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, miopenActivationMode_t); - #undef DECLARE_TYPE_FOR_GPU +#endif #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ + constexpr auto GPU_CV = MUSA_CV; #else // PADDLE_WITH_CUDA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + hipErrorOutOfMemory, + musaErrorMemoryAllocation); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, + cudaErrorNotReady, + hipErrorNotReady, + musaErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyHostToDevice, cudaMemcpyKind::cudaMemcpyHostToDevice, - hipMemcpyKind::hipMemcpyHostToDevice); + hipMemcpyKind::hipMemcpyHostToDevice, + musaMemcpyKind::musaMemcpyHostToDevice); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost, cudaMemcpyKind::cudaMemcpyDeviceToHost, - hipMemcpyKind::hipMemcpyDeviceToHost); + hipMemcpyKind::hipMemcpyDeviceToHost, + musaMemcpyKind::musaMemcpyDeviceToHost); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, cudaMemcpyKind::cudaMemcpyDeviceToDevice, - hipMemcpyKind::hipMemcpyDeviceToDevice); + hipMemcpyKind::hipMemcpyDeviceToDevice, + musaMemcpyKind::musaMemcpyDeviceToDevice); #undef DECLARE_CONSTANT_FOR_GPU } // namespace phi -#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || + // defined(PADDLE_WITH_MUSA ) diff --git a/paddle/phi/backends/gpu/musa/musa_device_function.h b/paddle/phi/backends/gpu/musa/musa_device_function.h new file mode 100644 index 0000000000000..074bb2ba0cbff --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_device_function.h @@ -0,0 +1,189 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define PADDLE_CUDA_FP16 +// NOTE(): support float16 to half in header file. 
+#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace backends { +namespace gpu { + +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T +CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { + return __shfl_down_sync(mask, val, static_cast(delta), width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, + T val, + int width = warpSize) { + return __shfl_xor_sync(mask, val, width); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( + unsigned mask, phi::dtype::float16 val, int delta, int width) { + return phi::dtype::float16(__shfl_down_sync( + mask, val.to_half(), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( + unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { +#if defined(PADDLE_MUSA_BF16) && defined(__MUSA_ARCH__) && __MUSA_ARCH__ >= 220 + return phi::dtype::bfloat16(__shfl_down_sync( + mask, val.to_mt_bfloat16(), static_cast(delta), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + float real = static_cast(__shfl_down_sync( + mask, static_cast(val.real), static_cast(delta), width)); + float imag = static_cast(__shfl_down_sync( + mask, static_cast(val.imag), static_cast(delta), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + double real = + static_cast(__shfl_down_sync(mask, + static_cast(val.real), + static_cast(delta), + width)); + double imag = + static_cast(__shfl_down_sync(mask, + static_cast(val.imag), + static_cast(delta), + width)); + return phi::dtype::complex(real, imag); +} + +// TODO(@MTAI): there is compiling error when compiling the following code +// template <> +// __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( +// unsigned mask, phi::dtype::float16 val, int width) { +// return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); +// } + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( + unsigned mask, phi::dtype::bfloat16 val, int width) { +#if defined(PADDLE_MUSA_BF16) + return phi::dtype::bfloat16( + __shfl_xor_sync(mask, val.to_mt_bfloat16(), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + float real = static_cast( + 
__shfl_xor_sync(mask, static_cast(val.real), width)); + float imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + double real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + double imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template +__forceinline__ __device__ T +CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { + return __shfl_sync(mask, val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. + const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + } + return val; +} +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_helper.h b/paddle/phi/backends/gpu/musa/musa_helper.h new file mode 100644 index 0000000000000..cbfc458abf8da --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_helper.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace backends { +namespace gpu { + +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (0 >= ((major)*1000 + (minor)*100 + (patch))) + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc new file mode 100644 index 0000000000000..f244601b9d9cc --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
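The reduceSum device function above performs a warp-level tree reduction with CudaShuffleDownSync, halving the active offset each step until lane 0 holds the partial sum. The following plain C++ sketch simulates one 32-lane warp with an array so the same idea can be run on the host; the kWarpSize constant and the lane vector are illustrative and not taken from the patch.

```cpp
// Host-side sketch (illustration only) of the shuffle-down tree reduction
// used by reduceSum above: at each step every lane adds the value held by
// the lane `offset` positions to its right, so lane 0 accumulates the sum.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const int kWarpSize = 32;
  std::vector<int> lane(kWarpSize);
  std::iota(lane.begin(), lane.end(), 1);  // lanes hold 1..32

  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    for (int i = 0; i + offset < kWarpSize; ++i) {
      lane[i] += lane[i + offset];  // models __shfl_down_sync(mask, val, offset)
    }
  }
  std::printf("warp sum = %d\n", lane[0]);  // 528 == 32 * 33 / 2
  return 0;
}
```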
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" +#include "paddle/phi/backends/gpu/gpu_info.h" + +#include "paddle/phi/core/enforce.h" + +#include "musa_runtime.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace phi { +namespace backends { +namespace gpu { + +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + // TODO(@caizhi): mudnnGetVersion is not supported now. + // version info will be returned from mudnnGetVersion later. + const int version_major = 1; + const int version_minor = 1; + const int version_patch = 0; + return version_major * 1000 + version_minor * 100 + version_patch; +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + musaError_t status = musaDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; + return 0; + } + + const auto *musa_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); + + if (musa_visible_devices != nullptr) { + std::string musa_visible_devices_str(musa_visible_devices); + if (!musa_visible_devices_str.empty()) { + musa_visible_devices_str.erase( + 0, musa_visible_devices_str.find_first_not_of('\'')); + musa_visible_devices_str.erase( + musa_visible_devices_str.find_last_not_of('\'') + 1); + musa_visible_devices_str.erase( + 0, musa_visible_devices_str.find_first_not_of('\"')); + musa_visible_devices_str.erase( + musa_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(musa_visible_devices_str.begin(), + musa_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " + "empty. No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int major, minor; + auto major_error_code = + musaDeviceGetAttribute(&major, musaDevAttrComputeCapabilityMajor, id); + auto minor_error_code = + musaDeviceGetAttribute(&minor, musaDevAttrComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { return false; } + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDevAttrMultiProcessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( + &count, musaDevAttrMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDevAttrMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); + return device_id; +} + +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + std::array ret; + int size; + auto error_code_x = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret[0] = size; + + auto error_code_y = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret[1] = size; + + auto error_code_z = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret[2] = size; + return ret; +} + +std::pair GetGpuStreamPriorityRange() { + int least_priority, greatest_priority; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); + return std::make_pair(least_priority, greatest_priority); +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(phi::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. 
Please input " + "appropriate device again!", + id, + static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS( + musaGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return musaGetLastError(); } + +bool IsGPUManagedMemorySupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + return false; +} + +bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + return false; +} + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/capi/lib/c_device_context.cc b/paddle/phi/capi/lib/c_device_context.cc index 96b46fbc0d4ff..e6163e5f362d3 100644 --- a/paddle/phi/capi/lib/c_device_context.cc +++ b/paddle/phi/capi/lib/c_device_context.cc @@ -35,7 +35,8 @@ PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext* ctx, reinterpret_cast(ctx)->stream()); } else if (dev_ctx_type == phi::AllocationType::CPU) { return nullptr; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast( reinterpret_cast(ctx)->stream()); diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc index e9fe2aada1f35..63c4085eface4 100644 --- a/paddle/phi/capi/lib/c_kernel_context.cc +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -30,7 +30,8 @@ PD_DeviceContext* PD_KernelContextGetDeviceContext(PD_KernelContext* ctx) { } else if (dev_ctx_type == phi::AllocationType::CPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 5540592d5013c..342e0a3ebe5ce 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -138,7 +138,8 @@ inline Backend StringToBackend(const char* backend_cstr) { } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // NOTE(chenweihang) KPS is not yet a complete backend, and it still needs // to be converted // to GPU in the GPU environment diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 7ea9b0cbb6477..d553ac9b1ff0c 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -26,6 +26,14 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif +#if defined(__MUSACC__) +#define PADDLE_MUSA_BF16 +#include +#endif + #if defined(__CUDACC__) && CUDA_VERSION >= 11000 #define PADDLE_CUDA_BF16 #include @@ -61,6 +69,13 @@ struct PADDLE_ALIGN(2) bfloat16 { tempRes = reinterpret_cast(&val); res = *tempRes; x = res >> 16; +#elif defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_MUSA_BF16) + __mt_bfloat16 tmp = __float2bfloat16(val); + x = *reinterpret_cast(&tmp); +#else + std::memcpy(&x, reinterpret_cast(&val) + 2, 2); +#endif #else #if defined(PADDLE_CUDA_BF16) __nv_bfloat16 tmp = __float2bfloat16(val); @@ -154,6 +169,16 @@ struct PADDLE_ALIGN(2) bfloat16 { uint16_t* temp_ptr = reinterpret_cast(&temp); res = *temp_ptr; return res; +#elif defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_MUSA_BF16 + return __bfloat162float(*reinterpret_cast(&x)); +#else + float val = 0.f; + uint16_t temp = x; + std::memcpy( + reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); + return val; +#endif #else #ifdef PADDLE_CUDA_BF16 return __bfloat162float(*reinterpret_cast(&x)); @@ -173,6 +198,12 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif +#ifdef 
PADDLE_MUSA_BF16 + HOSTDEVICE inline __mt_bfloat16 to_mt_bfloat16() const { + return *reinterpret_cast(&x); + } +#endif + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } HOSTDEVICE inline explicit operator int8_t() const { diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index a4e003dd544ad..c4b8ad9055f87 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -26,6 +26,11 @@ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include // NOLINT @@ -37,7 +42,8 @@ #define PADDLE_ALIGN(x) __declspec(align(x)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // todo #define PADDLE_WITH_CUDA_OR_HIP_COMPLEX #endif @@ -62,7 +68,8 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template HOSTDEVICE inline explicit complex(const thrust::complex& c) { @@ -83,6 +90,15 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE inline explicit operator hipDoubleComplex() const { return make_hipDoubleComplex(real, imag); } + +#elif defined(PADDLE_WITH_MUSA) + HOSTDEVICE inline explicit operator muFloatComplex() const { + return make_muFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator muDoubleComplex() const { + return make_muDoubleComplex(real, imag); + } #else HOSTDEVICE inline explicit operator cuFloatComplex() const { return make_cuFloatComplex(real, imag); @@ -187,7 +203,7 @@ template HOSTDEVICE inline complex operator+(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) + thrust::complex(b)); #else return complex(a.real + b.real, a.imag + b.imag); @@ -198,7 +214,7 @@ template HOSTDEVICE inline complex operator-(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) - thrust::complex(b)); #else return complex(a.real - b.real, a.imag - b.imag); @@ -209,7 +225,7 @@ template HOSTDEVICE inline complex operator*(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) * thrust::complex(b)); #else return complex(a.real * b.real - a.imag * b.imag, @@ -221,7 +237,7 @@ template HOSTDEVICE inline complex operator/(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) / thrust::complex(b)); #else T denominator = b.real * b.real + b.imag * b.imag; @@ -233,7 +249,7 @@ HOSTDEVICE inline complex operator/(const complex& a, template HOSTDEVICE inline complex operator-(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || 
defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(-thrust::complex(a.real, a.imag)); #else complex res; @@ -247,7 +263,7 @@ template HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) += thrust::complex(b.real, b.imag)); return a; @@ -262,7 +278,7 @@ template HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) -= thrust::complex(b.real, b.imag)); return a; @@ -277,7 +293,7 @@ template HOSTDEVICE inline complex& operator*=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) *= thrust::complex(b.real, b.imag)); return a; @@ -292,7 +308,7 @@ template HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) /= thrust::complex(b.real, b.imag)); return a; @@ -355,7 +371,7 @@ HOSTDEVICE inline complex(min)(const complex& a, const complex& b) { template HOSTDEVICE inline bool(isnan)(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isnan(a.real) || ::isnan(a.imag); #else return std::isnan(a.real) || std::isnan(a.imag); @@ -365,7 +381,7 @@ HOSTDEVICE inline bool(isnan)(const complex& a) { template HOSTDEVICE inline bool isinf(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isinf(a.real) || ::isinf(a.imag); #else return std::isinf(a.real) || std::isinf(a.imag); @@ -375,7 +391,7 @@ HOSTDEVICE inline bool isinf(const complex& a) { template HOSTDEVICE inline bool isfinite(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isfinite(a.real) || ::isfinite(a.imag); #else return std::isfinite(a.real) || std::isfinite(a.imag); @@ -385,7 +401,7 @@ HOSTDEVICE inline bool isfinite(const complex& a) { template HOSTDEVICE inline T abs(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return thrust::abs(thrust::complex(a)); #else return std::abs(std::complex(a)); @@ -395,7 +411,7 @@ HOSTDEVICE inline T abs(const complex& a) { template HOSTDEVICE inline T arg(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return 
thrust::arg(thrust::complex(a)); #else return std::arg(std::complex(a)); @@ -405,7 +421,7 @@ HOSTDEVICE inline T arg(const complex& a) { template HOSTDEVICE inline complex pow(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); #else return complex(std::pow(std::complex(a), std::complex(b))); @@ -415,7 +431,7 @@ HOSTDEVICE inline complex pow(const complex& a, const complex& b) { template HOSTDEVICE inline complex sqrt(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::sqrt(thrust::complex(a))); #else return complex(std::sqrt(std::complex(a))); @@ -425,7 +441,7 @@ HOSTDEVICE inline complex sqrt(const complex& a) { template HOSTDEVICE inline complex tanh(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::tanh(thrust::complex(a))); #else return complex(std::tanh(std::complex(a))); @@ -435,7 +451,7 @@ HOSTDEVICE inline complex tanh(const complex& a) { template HOSTDEVICE inline complex log(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::log(thrust::complex(a))); #else return complex(std::log(std::complex(a))); diff --git a/paddle/phi/common/cpstring_impl.h b/paddle/phi/common/cpstring_impl.h index 6783799026d44..b57b485f43bc4 100644 --- a/paddle/phi/common/cpstring_impl.h +++ b/paddle/phi/common/cpstring_impl.h @@ -26,7 +26,7 @@ limitations under the License. 
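The complex&lt;T&gt; operators above delegate to thrust::complex when compiled for a device (now including __MUSACC__) and fall back to explicit formulas on the host. The snippet below is only a quick host-side check that the manual multiply formula from the #else branch agrees with the standard library; it is not code from the patch.

```cpp
// Illustration only: verify the hand-written complex multiply used on the
// CPU path above against std::complex. The device path computes the same
// product through thrust::complex.
#include <cassert>
#include <complex>
#include <cstdio>

int main() {
  std::complex<float> a(1.5f, -2.0f), b(0.5f, 3.0f);
  // Manual formula, as in the #else branch of operator*:
  std::complex<float> manual(a.real() * b.real() - a.imag() * b.imag(),
                             a.imag() * b.real() + a.real() * b.imag());
  std::complex<float> lib = a * b;
  assert(std::abs(manual - lib) < 1e-6f);
  std::printf("(%g, %g)\n", manual.real(), manual.imag());  // (6.75, 3.5)
  return 0;
}
```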
*/ #include "paddle/phi/core/macros.h" -#if (defined(__NVCC__) || defined(__HIPCC__)) +#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ @@ -77,7 +77,8 @@ HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) { } #endif -#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__)) +#if PD_PSTRING_LITTLE_ENDIAN || \ + (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) #define PD_le32toh(x) x #else // PD_PSTRING_LITTLE_ENDIAN #define PD_le32toh(x) swap32(x) @@ -209,7 +210,7 @@ HOSTDEVICE static inline void *PD_Malloc(size_t size) { return malloc(size); } HOSTDEVICE static inline void *PD_Realloc(void *ptr, size_t old_size UNUSED, size_t new_size) { -#if (defined(__NVCC__) || defined(__HIPCC__)) +#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) if (old_size >= new_size) { return ptr; } diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 86168d441ded2..75fea3d88ab0c 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -37,6 +37,10 @@ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #endif @@ -46,6 +50,11 @@ #include #endif +#if defined(__MUSACC__) +#define PADDLE_CUDA_FP16 +#include +#endif + #ifdef __HIPCC__ #define PADDLE_CUDA_FP16 #include @@ -82,8 +91,10 @@ struct PADDLE_ALIGN(2) float16 { // Constructors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ + CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else x = h.x; @@ -101,8 +112,9 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline explicit float16(float val) { -#if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) +#if defined(PADDLE_CUDA_FP16) && \ + (defined(__HIPCC__) || defined(__MUSACC__) || \ + (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -144,7 +156,8 @@ struct PADDLE_ALIGN(2) float16 { // Assignment operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline float16& operator=(const half& rhs) { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ + CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; #else x = rhs.x; @@ -218,7 +231,8 @@ struct PADDLE_ALIGN(2) float16 { // Conversion operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline half to_half() const { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ + CUDA_VERSION >= 9000 __half_raw h; h.x = x; return half(h); @@ -237,8 +251,9 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline operator float() const { -#if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) +#if defined(PADDLE_CUDA_FP16) && \ + (defined(__HIPCC__) || defined(__MUSACC__) || \ + (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -395,7 +410,7 @@ DEVICE inline half operator-(const half& a) { #endif } 
-#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ +#ifdef PADDLE_WITH_CUDA // not defined __HIP_NO_HALF_OPERATORS__ DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT a = a + b; return a; @@ -1017,6 +1032,7 @@ struct is_floating_point std::is_same< phi::dtype::float16, typename std::remove_cv::type>::value> {}; + template <> struct is_signed { static const bool value = true; diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc index f9ef606049297..cf4c3ca12869d 100644 --- a/paddle/phi/common/memory_utils.cc +++ b/paddle/phi/common/memory_utils.cc @@ -69,7 +69,8 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total) { return MemoryUtils::Instance().GpuMemoryUsage(available, total); } diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index f6a4afcea2f78..0aa0c745501ec 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -118,7 +118,8 @@ struct MemoryInterface { int64_t (*device_memory_stat_current_value)(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * @brief get the memory usage of current GPU device. * @@ -271,7 +272,8 @@ class MemoryUtils { return memory_method_->device_memory_stat_current_value(stat_type, dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total) { CheckMemoryMethod(); PADDLE_ENFORCE_NOT_NULL( @@ -372,7 +374,8 @@ void Copy(const Place& dst_place, int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total); #endif diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index fe15be4b2b909..0f8d7a173ad52 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -123,7 +123,8 @@ static int8_t GetCorrectDeviceIdByPlaceType( switch (place_type) { case paddle::PlaceType::kCPU: return 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) case paddle::PlaceType::kGPU: return phi::backends::gpu::GetCurrentDeviceId(); #endif @@ -169,7 +170,8 @@ bool operator==(PlaceType place_type, const Place &place) { GPUPlace DefaultGPUPlace() { return GPUPlace( -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::backends::gpu::GetCurrentDeviceId()); #else 0); diff --git a/paddle/phi/common/transform.h b/paddle/phi/common/transform.h index e80561284b885..620d3d683fbf0 100644 --- a/paddle/phi/common/transform.h +++ b/paddle/phi/common/transform.h @@ -21,7 +21,7 @@ limitations under the License. 
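Most of the remaining hunks repeat the same three-way preprocessor guard so that MUSA builds take the existing CUDA/HIP code paths. As a hedged aside, such a guard could in principle be collapsed into a single helper macro; PADDLE_WITH_GPU_BACKEND below is an invented name used only for this sketch and is not something the patch defines.

```cpp
// Illustration only: centralizing the repeated CUDA/HIP/MUSA guard in one
// helper macro. The pretend-configuration define makes the sketch runnable.
#include <cstdio>

#define PADDLE_WITH_MUSA  // pretend-configuration for this sketch

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_MUSA)
#define PADDLE_WITH_GPU_BACKEND 1
#else
#define PADDLE_WITH_GPU_BACKEND 0
#endif

int main() {
#if PADDLE_WITH_GPU_BACKEND
  std::printf("built with a GPU backend\n");
#else
  std::printf("CPU-only build\n");
#endif
  return 0;
}
```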
*/ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include #include "thrust/device_ptr.h" @@ -92,7 +92,7 @@ struct Transform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) // PointerToThrustDevicePtr has two specializations, one casts a (CUDA // device) pointer into thrust::device_ptr, the other keeps rest types @@ -153,6 +153,12 @@ struct Transform { CastToCUDATransformIterator(last), CastToCUDATransformIterator(result), op); +#elif defined(__MUSACC__) + thrust::transform(thrust::musa::par.on(context.stream()), + CastToCUDATransformIterator(first), + CastToCUDATransformIterator(last), + CastToCUDATransformIterator(result), + op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first), @@ -184,6 +190,13 @@ struct Transform { CastToCUDATransformIterator(first2), CastToCUDATransformIterator(result), op); +#elif defined(__MUSACC__) + thrust::transform(thrust::musa::par.on(context.stream()), + CastToCUDATransformIterator(first1), + CastToCUDATransformIterator(last1), + CastToCUDATransformIterator(first2), + CastToCUDATransformIterator(result), + op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first1), diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 947c7fb45c5fc..24eb8115e970f 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -57,7 +57,8 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { switch (backend) { case phi::Backend::CPU: return phi::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) case phi::Backend::GPU: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -66,7 +67,8 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::ONEDNN: return phi::CPUPlace(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -77,7 +79,8 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif case phi::Backend::KPS: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return phi::GPUPlace( set_device_id ? 
phi::backends::gpu::GetCurrentDeviceId() : 0); #elif defined(PADDLE_WITH_XPU_KP) diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index b27770b081433..26ec22f103a90 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -28,6 +28,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_MUSA +#include +using gpuStream_t = musaStream_t; +#endif + #include "glog/logging.h" #include "paddle/phi/core/enforce.h" @@ -73,6 +78,9 @@ class CUDAStream { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream, static_cast(flag), priority)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreateWithPriority( + &stream, static_cast(flag), priority)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream, static_cast(flag), priority)); @@ -92,6 +100,8 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(raw_stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(raw_stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(raw_stream())); #endif @@ -112,6 +122,14 @@ class CUDAStream { if (err == hipErrorNotReady) { return false; } +#elif defined(PADDLE_WITH_MUSA) + musaError_t err = musaStreamQuery(raw_stream()); + if (err == musaSuccess) { + return true; + } + if (err == musaErrorNotReady) { + return false; + } #else cudaError_t err = cudaStreamQuery(raw_stream()); if (err == cudaSuccess) { @@ -134,6 +152,8 @@ class CUDAStream { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(raw_stream(), ev, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(raw_stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(raw_stream(), ev, 0)); #endif @@ -146,6 +166,8 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP hipStreamDestroy(raw_stream()); +#elif defined(PADDLE_WITH_MUSA) + musaStreamDestroy(raw_stream()); #else cudaStreamDestroy(raw_stream()); #endif diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 6b98fd0488595..4adac10dd658f 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -35,6 +35,17 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#include +#include +#include +#include +#include +using mudnnStatus_t = ::musa::dnn::Status; +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include @@ -75,6 +86,17 @@ limitations under the License. */ #endif // __APPLE__ #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/murand.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include + +#include "paddle/phi/backends/dynload/mccl.h" +#endif // __APPLE__ +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hipfft.h" #include "paddle/phi/backends/dynload/hiprand.h" @@ -90,7 +112,8 @@ limitations under the License. 
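The Query() branch added to CUDAStream above distinguishes three outcomes of musaStreamQuery: success means the stream has drained, musaErrorNotReady means work is still in flight, and any other code is treated as a hard error. The following host-only model captures that tri-state handling; the StreamStatus enum is an invented stand-in for musaError_t.

```cpp
// Illustration only: the tri-state stream-query logic used by
// CUDAStream::Query() above, modeled with a fake status enum.
#include <cstdio>
#include <stdexcept>

enum class StreamStatus { kSuccess, kNotReady, kInvalidHandle };

bool StreamQuery(StreamStatus s) {
  if (s == StreamStatus::kSuccess) return true;     // all queued work finished
  if (s == StreamStatus::kNotReady) return false;   // work still pending
  // Real code forwards any other status to PADDLE_ENFORCE_GPU_SUCCESS.
  throw std::runtime_error("stream query failed");
}

int main() {
  std::printf("%d\n", StreamQuery(StreamStatus::kSuccess));   // 1
  std::printf("%d\n", StreamQuery(StreamStatus::kNotReady));  // 0
  return 0;
}
```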
*/ // Note: these headers for simplify demangle type string #include "paddle/phi/core/type_defs.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_types.h" #endif @@ -391,6 +414,17 @@ struct EnforceNotMet : public std::exception { abort(); \ } \ } while (0) +#elif defined(__MUSACC__) +#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ + do { \ + if (!(_IS_NOT_ERROR)) { \ + printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", \ + __FILE__, \ + __LINE__, \ + #_IS_NOT_ERROR, \ + ##__VA_ARGS__); \ + } \ + } while (0) #else #define PADDLE_ENFORCE(COND, ...) \ do { \ @@ -830,6 +864,273 @@ inline void retry_sleep(unsigned milliseconds) { #undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_CUDA +/************************************************************************/ +/**************************** MUSA ERROR ********************************/ +#ifdef PADDLE_WITH_MUSA + +namespace details { + +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_EXTERNAL_API_TYPE(musaError_t, musaSuccess); +DEFINE_EXTERNAL_API_TYPE(murandStatus_t, MURAND_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(mudnnStatus_t, ::musa::dnn::Status::SUCCESS); +DEFINE_EXTERNAL_API_TYPE(mublasStatus_t, MUBLAS_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(musparseStatus_t, MUSPARSE_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(mufftResult_t, MUFFT_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(MUresult, MUSA_SUCCESS); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +DEFINE_EXTERNAL_API_TYPE(mcclResult_t, mcclSuccess); +#endif + +} // namespace details + +/*************** MUSA ERROR ***************/ +inline bool is_error(musaError_t e) { return e != musaSuccess; } + +inline std::string build_musa_error_msg(musaError_t e) { + std::ostringstream sout; + sout << "MUSA error(" << e << "), " << musaGetErrorString(e) << ". 
"; + return sout.str(); +} + +/*************** MURAND ERROR ***************/ +inline bool is_error(murandStatus_t stat) { + return stat != MURAND_STATUS_SUCCESS; +} + +inline const char* murandGetErrorString(murandStatus_t stat) { + switch (stat) { + case MURAND_STATUS_SUCCESS: + return "MURAND_STATUS_SUCCESS"; + case MURAND_STATUS_VERSION_MISMATCH: + return "MURAND_STATUS_VERSION_MISMATCH"; + case MURAND_STATUS_NOT_CREATED: + return "MURAND_STATUS_NOT_CREATED"; + case MURAND_STATUS_ALLOCATION_FAILED: + return "MURAND_STATUS_ALLOCATION_FAILED"; + case MURAND_STATUS_TYPE_ERROR: + return "MURAND_STATUS_TYPE_ERROR"; + case MURAND_STATUS_OUT_OF_RANGE: + return "MURAND_STATUS_OUT_OF_RANGE"; + case MURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "MURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case MURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "MURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case MURAND_STATUS_LAUNCH_FAILURE: + return "MURAND_STATUS_LAUNCH_FAILURE"; + case MURAND_STATUS_INTERNAL_ERROR: + return "MURAND_STATUS_INTERNAL_ERROR"; + case MURAND_STATUS_NOT_IMPLEMENTED: + return "MURAND_STATUS_NOT_IMPLEMENTED"; + default: + return "Unknown murand status"; + } +} + +inline std::string build_musa_error_msg(murandStatus_t stat) { + std::ostringstream sout; + sout << "MURAND error: " << murandGetErrorString(stat) << "."; + return sout.str(); +} + +/*************** MUBLAS ERROR ***************/ +inline bool is_error(mublasStatus_t stat) { + return stat != MUBLAS_STATUS_SUCCESS; +} + +inline const char* mublasGetErrorString(mublasStatus_t stat) { + switch (stat) { + case MUBLAS_STATUS_SUCCESS: + return "MUBLAS_STATUS_SUCCESS"; + case MUBLAS_STATUS_INVALID_HANDLE: + return "MUBLAS_STATUS_INVALID_HANDLE"; + case MUBLAS_STATUS_NOT_IMPLEMENTED: + return "MUBLAS_STATUS_NOT_IMPLEMENTED"; + case MUBLAS_STATUS_INVALID_POINTER: + return "MUBLAS_STATUS_INVALID_POINTER"; + case MUBLAS_STATUS_INVALID_SIZE: + return "MUBLAS_STATUS_INVALID_SIZE"; + case MUBLAS_STATUS_MEMORY_ERROR: + return "MUBLAS_STATUS_MEMORY_ERROR"; + case MUBLAS_STATUS_INTERNAL_ERROR: + return "MUBLAS_STATUS_INTERNAL_ERROR"; + case MUBLAS_STATUS_PERF_DEGRADED: + return "MUBLAS_STATUS_PERF_DEGRADED"; + case MUBLAS_STATUS_SIZE_QUERY_MISMATCH: + return "MUBLAS_STATUS_SIZE_QUERY_MISMATCH"; + case MUBLAS_STATUS_SIZE_INCREASED: + return "MUBLAS_STATUS_SIZE_INCREASED"; + case MUBLAS_STATUS_SIZE_UNCHANGED: + return "MUBLAS_STATUS_SIZE_UNCHANGED"; + case MUBLAS_STATUS_INVALID_VALUE: + return "MUBLAS_STATUS_INVALID_VALUE"; + case MUBLAS_STATUS_CONTINUE: + return "MUBLAS_STATUS_CONTINUE"; + default: + return "Unknown mublas status"; + } +} +inline std::string build_musa_error_msg(mublasStatus_t stat) { + std::ostringstream sout; + sout << "MUBLAS error: " << mublasGetErrorString(stat) << "."; + return sout.str(); +} + +/*************** MUSPARSE ERROR ***************/ +inline bool is_error(musparseStatus_t stat) { + return stat != MUSPARSE_STATUS_SUCCESS; +} + +inline const char* musparseGetErrorString(musparseStatus_t stat) { + switch (stat) { + case MUSPARSE_STATUS_SUCCESS: + return "MUSPARSE_STATUS_SUCCESSS"; + case MUSPARSE_STATUS_INVALID_HANDLE: + return "MUSPARSE_STATUS_INVALID_HANDLE"; + case MUSPARSE_STATUS_NOT_IMPLEMENTED: + return "MUSPARSE_STATUS_NOT_IMPLEMENTED"; + case MUSPARSE_STATUS_INVALID_POINTER: + return "MUSPARSE_STATUS_INVALID_POINTER"; + case MUSPARSE_STATUS_INVALID_SIZE: + return "MUSPARSE_STATUS_INVALID_SIZE"; + case MUSPARSE_STATUS_MEMORY_ERROR: + return "MUSPARSE_STATUS_MEMORY_ERROR"; + case MUSPARSE_STATUS_INTERNAL_ERROR: + return 
"MUSPARSE_STATUS_INTERNAL_ERROR"; + case MUSPARSE_STATUS_INVALID_VALUE: + return "MUSPARSE_STATUS_INVALID_VALUE"; + case MUSPARSE_STATUS_ARCH_MISMATCH: + return "MUSPARSE_STATUS_ARCH_MISMATCH"; + case MUSPARSE_STATUS_ZERO_PIVOT: + return "MUSPARSE_STATUS_ZERO_PIVOT"; + case MUSPARSE_STATUS_NOT_INITIALIZED: + return "MUSPARSE_STATUS_NOT_INITIALIZED"; + case MUSPARSE_STATUS_TYPE_MISMATCH: + return "MUSPARSE_STATUS_TYPE_MISMATCH"; + case MUSPARSE_STATUS_REQUIRES_SORTED_STORAGE: + return "MUSPARSE_STATUS_REQUIRES_SORTED_STORAGE"; + default: + return "Unknown musparse status"; + } +} + +inline std::string build_musa_error_msg(musparseStatus_t stat) { + std::ostringstream sout; + sout << "MUSparse error: " << musparseGetErrorString(stat) << "."; + return sout.str(); +} + +/**************** MCCL ERROR ****************/ +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +inline bool is_error(mcclResult_t mccl_result) { + return mccl_result != mcclSuccess; +} + +inline std::string build_musa_error_msg(mcclResult_t mccl_result) { + std::ostringstream sout; + sout << "MCCL error(" << mccl_result << "), " + << phi::dynload::mcclGetErrorString(mccl_result) << ". "; + if (errno == ENOSPC || errno == EAGAIN) { + std::string detail(strerror(errno)); + detail += "\nPlease try one of the following solutions:"; + detail += "\n1. export MCCL_SHM_DISABLE=1;"; + detail += "\n2. export MCCL_P2P_LEVEL=SYS;"; + detail += + "\n3. Increase shared memory by setting the -shm-size " + "option when starting docker container, e.g., setting " + " -shm-size=2g.\n"; + sout << " Detail: " + detail; + } + return sout.str(); +} +#endif // not(__APPLE__) and PADDLE_WITH_MCCL + +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#define PADDLE_WARN_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + ::phi::enforce::ThrowWarnInternal( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + } \ + } while (0) + +#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP) \ + do { \ + auto res = musaGetLastError(); \ + if (UNLIKELY(res != musaSuccess)) { \ + auto msg = ::phi::enforce::build_musa_error_msg(res); \ + PADDLE_THROW( \ + phi::errors::Fatal("MUSA error after kernel (%s): %s", OP, msg)); \ + } \ + } while (0) + +inline void retry_sleep(unsigned milliseconds) { +#ifdef _WIN32 + Sleep(milliseconds); +#else + if (milliseconds < 1000) { + // usleep argument must be less than 1,000,000. 
Reference: + // https://pubs.opengroup.org/onlinepubs/7908799/xsh/usleep.html + usleep(milliseconds * 1000); + } else { + // clip to sleep in seconds because we can not and don't have to + // sleep for exact milliseconds + sleep(milliseconds / 1000); + } +#endif +} + +#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + int retry_count = 1; \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ + phi::enforce::retry_sleep(10000); \ + __cond__ = (COND); \ + ++retry_count; \ + } \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#undef DEFINE_EXTERNAL_API_TYPE +#endif // PADDLE_WITH_MUSA + /**************************************************************************/ /***************************** HIP ERROR **********************************/ #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 0c581fb09919f..ec6ac698cf567 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -14,7 +14,8 @@ // limitations under the License. #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" #endif @@ -120,7 +121,8 @@ PHI_DEFINE_EXPORTED_bool( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * CUDA related related FLAG @@ -215,7 +217,8 @@ PHI_DEFINE_EXPORTED_bool( true, "Whether enable api kernel fallback to CPU one when not found"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * CUDNN related FLAG * Name: FLAGS_cudnn_deterministic @@ -322,7 +325,8 @@ PHI_DEFINE_EXPORTED_bool( "batch_norm, default is False."); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * NCCL related FLAG @@ -541,8 +545,9 @@ PHI_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUSTOM_DEVICE) || \ + defined(PADDLE_WITH_XPU) /** * Memory related FLAG @@ -785,7 +790,8 @@ PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_off, * Example: * Note: Check kernel launch status after every kernel compute. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool( check_kernel_launch, false, @@ -800,7 +806,8 @@ PHI_DEFINE_EXPORTED_bool( * Example: * Note: Disable cudnn in conv2d. 
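PADDLE_RETRY_CUDA_SUCCESS above retries a failing runtime call up to five times, sleeping ten seconds between attempts, and only then throws. The small host-side sketch below shows the same retry loop in isolation; RetryUntilSuccess and its callback are illustrative names, not part of the patch.

```cpp
// Illustration only: the retry loop implemented by PADDLE_RETRY_CUDA_SUCCESS
// above, rewritten as a plain helper so it can run on the host.
#include <cstdio>
#include <functional>

bool RetryUntilSuccess(const std::function<bool()>& op, int max_attempts) {
  for (int attempt = 1; attempt <= max_attempts; ++attempt) {
    if (op()) return true;  // success, stop retrying
    std::printf("attempt %d failed, retrying\n", attempt);
    // Real macro: phi::enforce::retry_sleep(10000);  // 10 s between attempts
  }
  return false;  // caller reports the failure, as the macro throws
}

int main() {
  int calls = 0;
  bool ok = RetryUntilSuccess([&] { return ++calls >= 3; }, 5);
  std::printf("ok=%d after %d calls\n", ok, calls);  // ok=1 after 3 calls
  return 0;
}
```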
*/ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); @@ -1127,7 +1134,8 @@ PHI_DEFINE_EXPORTED_bool(gpugraph_debug_gpu_memory, * Example: * Note: nccl blocking wait. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); #endif diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index 4ed25af0814df..d5e10d9c2d006 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -63,7 +63,8 @@ const std::shared_ptr& DefaultXPUGenerator(int64_t device_id) { } const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) static int64_t num_cuda_devices = -1; static std::once_flag num_devices_init_flag; @@ -278,7 +279,8 @@ uint64_t Generator::Random64() { std::pair Generator::IncrementOffset( uint64_t increment_offset) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::lock_guard lock(this->mu_); uint64_t cur_offset = this->state_.thread_offset; this->state_.thread_offset += increment_offset; diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index decebbe66a538..85feb0d060439 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -18,6 +18,10 @@ #include #endif +#ifdef __MUSACC__ +#include +#endif + #if defined(__xpu__) #include @@ -26,7 +30,8 @@ #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 6511efa0152ee..dc0134da132dc 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -120,7 +120,8 @@ const Kernel& KernelFactory::SelectKernelWithGPUDNN( return empty_kernel; } KernelKey kernel_key = KernelKey(const_kernel_key); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); @@ -221,7 +222,8 @@ KernelResult KernelFactory::SelectKernelOrThrowError( KernelKey kernel_key = KernelKey(const_kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, const_kernel_key.dtype()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index f4e021f7269a7..ce795bf781577 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -60,7 +60,8 @@ struct 
KernelArgsParseFunctor { #if defined(PADDLE_WITH_MKLDNN) || arg_type == std::type_index(typeid(const OneDNNContext&)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) || arg_type == std::type_index(typeid(const GPUContext&)) #elif defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || arg_type == std::type_index(typeid(const XPUContext&)) @@ -1401,7 +1402,8 @@ struct KernelRegistrar { meta_kernel_fn, \ BACKEND_LIST) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #define _DEVICE GPU, #elif defined(PADDLE_WITH_XPU) #define _DEVICE XPU, diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index f4dc4636bdde3..1aad3dd59611e 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -273,7 +273,8 @@ struct KernelImpl { /* DeviceContext Helpers */ PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext); #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/phi/core/macros.h b/paddle/phi/core/macros.h index 2e78357492734..f3dae52b04387 100644 --- a/paddle/phi/core/macros.h +++ b/paddle/phi/core/macros.h @@ -53,7 +53,7 @@ namespace phi { #define PD_CONCATENATE2(arg1, arg2) arg1##arg2 #define PD_EXPAND(x) x -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #define PADDLE_RESTRICT __restrict__ #else #define PADDLE_RESTRICT diff --git a/paddle/phi/core/mixed_vector.cc b/paddle/phi/core/mixed_vector.cc index 857bd546befcd..778ec44c28ee3 100644 --- a/paddle/phi/core/mixed_vector.cc +++ b/paddle/phi/core/mixed_vector.cc @@ -33,7 +33,8 @@ template void CopyToCPUHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // COPY GPU Data To CPU auto *dev_ctx = static_cast( phi::DeviceContextPool::Instance().Get((*gpu_)->place())); @@ -55,7 +56,8 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_, const phi::Place &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void *src = cpu_->data(); *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_); diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 0e465982ba429..c8b14db5615ed 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -114,9 +114,12 @@ void StringTensor::init_holder() { if (place.GetType() == phi::AllocationType::CPU) { std::memset(ptr, 0, bytes_size); } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0, bytes_size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(ptr, 0, bytes_size); #else cudaMemset(ptr, 0, bytes_size); #endif diff --git 
a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index abe44d3e2550b..e605673ea78e7 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -58,7 +58,8 @@ void Copy(const Context& dev_ctx, #ifdef PADDLE_WITH_MKLDNN dst->set_layout(src.layout()); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (dst_place.GetType() == AllocationType::GPU || dst_place.GetType() == AllocationType::GPUPINNED) { dst_ptr = dev_ctx.Alloc( @@ -99,7 +100,8 @@ void Copy(const Context& dev_ctx, if (src_place.GetType() == AllocationType::CPU && dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if ((src_place.GetType() == AllocationType::CPU || src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT (dst_place.GetType() == AllocationType::CPU || @@ -386,7 +388,8 @@ template void Copy(const DeviceContext& dev_ctx, bool blocking, TensorArray* dst); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template void Copy(const GPUContext& dev_ctx, const DenseTensor& src, Place dst_place, @@ -468,7 +471,8 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -522,7 +526,8 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -614,7 +619,8 @@ void TensorFromArray(const T* src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -714,7 +720,8 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -756,7 +763,8 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + 
defined(PADDLE_WITH_MUSA) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, diff --git a/paddle/phi/core/utils/array.h b/paddle/phi/core/utils/array.h index 44290b73737fb..2ebf2f933b77a 100644 --- a/paddle/phi/core/utils/array.h +++ b/paddle/phi/core/utils/array.h @@ -54,7 +54,7 @@ class Array { } HOSTDEVICE inline T &at(size_t i) { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) && !defined(__MUSACC__) PADDLE_ENFORCE_LT( i, N, phi::errors::OutOfRange("Array index out of bounds.")); #endif @@ -62,7 +62,7 @@ class Array { } HOSTDEVICE inline const T &at(size_t i) const { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) && !defined(__MUSACC__) PADDLE_ENFORCE_LT( i, N, phi::errors::OutOfRange("Array index out of bounds.")); #endif @@ -103,7 +103,7 @@ class Array { HOSTDEVICE inline T *GetMutable() { return nullptr; } HOSTDEVICE inline T &operator[](size_t) { -#if defined(__HIPCC__) || defined(__CUDA_ARCH__) +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) || defined(__MUSA_ARCH__) // HIP and CUDA will have compile error, if use "obj()" // function declared in block scope cannot have 'static' storage class static T obj{}; @@ -114,7 +114,7 @@ class Array { } HOSTDEVICE inline const T &operator[](size_t) const { -#if defined(__HIPCC__) || defined(__CUDA_ARCH__) +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) || defined(__MUSA_ARCH__) // HIP and CUDA will have compile error, if use "obj()" // function declared in block scope cannot have 'static' storage class static const T obj{}; diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index 2a554525024c8..9a7dc398f2f7f 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -61,11 +61,12 @@ template class TypeInfoTraits; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template class TypeInfoTraits; #endif diff --git a/paddle/phi/core/utils/visit_place.h b/paddle/phi/core/utils/visit_place.h index 6318b17647cd6..874e4ebcaa37b 100644 --- a/paddle/phi/core/utils/visit_place.h +++ b/paddle/phi/core/utils/visit_place.h @@ -25,7 +25,8 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, const Visitor& visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::GPUPlace p(place.GetDeviceId()); return visitor(p); #else @@ -35,7 +36,8 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::GPUPinnedPlace p; return visitor(p); #else diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 71bbfaa333a0a..818a0698069bd 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -858,7 +858,8 @@ void CoalesceTensorInferMeta(const std::vector& input, size_of_dtype = phi::SizeOf(dtype); } if (config.is_runtime) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int64_t 
numel = 0; for (size_t i = 0; i < input.size(); ++i) { const auto& dim = input[i]->dims(); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 25367be206139..623e66bc137b8 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -40,6 +40,85 @@ file( "strings/gpu/*.cu" "fusion/gpu/*.cu") +# FIXME(@MTAI): compilation error will occur when compiling the following files. +# This need to be fixed later. +if(WITH_MUSA) + list( + REMOVE_ITEM + kernel_cu + "fusion/gpu/fused_softmax_mask_grad_kernel.cu" + "fusion/gpu/fused_softmax_mask_kernel.cu" + "gpu/batch_norm_grad_kernel.cu" + "gpu/batch_norm_kernel.cu" + "gpu/cholesky_grad_kernel.cu" + "gpu/cholesky_solve_grad_kernel.cu" + "gpu/conv_grad_kernel.cu" + "gpu/conv_kernel.cu" + "gpu/cross_entropy_grad_kernel.cu" + "gpu/cross_entropy_kernel.cu" + "gpu/conv_transpose_grad_kernel.cu" + "gpu/conv_transpose_kernel.cu" + "gpu/cudnn_lstm_grad_kernel.cu" + "gpu/cudnn_lstm_kernel.cu" + "gpu/depthwise_conv_grad_kernel.cu" + "gpu/depthwise_conv_kernel.cu" + "gpu/dist_kernel.cu" + "gpu/elementwise_divide_grad_kernel.cu" + "gpu/elementwise_grad_kernel.cu" + "gpu/elementwise_multiply_grad_kernel.cu" + "gpu/erfinv_kernel.cu" + "gpu/exponential_kernel.cu" + "gpu/fft_grad_kernel.cu" + "gpu/fft_kernel.cu" + "gpu/fused_softmax_mask_grad_kernel.cu" + "gpu/gaussian_kernel.cu" + "gpu/gelu_grad_kernel.cu" + "gpu/gelu_kernel.cu" + "gpu/histogram_kernel.cu" + "gpu/instance_norm_grad_kernel.cu" + "gpu/instance_norm_kernel.cu" + "gpu/interpolate_grad_kernel.cu" + "gpu/kthvalue_grad_kernel.cu" + "gpu/kthvalue_kernel.cu" + "gpu/layer_norm_grad_kernel.cu" + "gpu/layer_norm_kernel.cu" + "gpu/llm_int8_mat_mul_kernel.cu" + "gpu/log_softmax_grad_kernel.cu" + "gpu/log_softmax_kernel.cu" + "gpu/lstsq_kernel.cu" + "gpu/nanmedian_kernel.cu" + "gpu/rnn_grad_kernel.cu.cc" + "gpu/rnn_kernel.cu.cc" + "gpu/slogdeterminant_grad_kernel.cu" + "gpu/softmax_grad_kernel.cu" + "gpu/softmax_kernel.cu" + "gpu/solve_grad_kernel.cu" + "gpu/solve_kernel.cu" + "gpu/spectral_norm_grad_kernel.cu" + "gpu/spectral_norm_kernel.cu" + "gpu/stft_kernel.cu" + "gpu/svd_grad_kernel.cu" + "gpu/top_k_grad_kernel.cu" + "gpu/top_k_kernel.cu" + "gpu/truncated_gaussian_random_kernel.cu" + "gpudnn/affine_grid_grad_kernel.cu" + "gpudnn/affine_grid_kernel.cu" + "gpudnn/softmax_grad_kernel.cu" + "gpudnn/softmax_kernel.cu" + "gpudnn/conv_grad_kernel.cu" + "gpudnn/conv_kernel.cu" + "gpudnn/conv_transpose_grad_kernel.cu" + "gpudnn/conv_transpose_kernel.cu" + "gpudnn/pool_grad_kernel.cu" + "gpudnn/pool_kernel.cu" + "sparse/gpu/softmax_grad_kernel.cu" + "sparse/gpu/softmax_kernel.cu" + "sparse/gpu/conv_kernel.cu" + "sparse/gpu/pool_kernel.cu" + "strings/gpu/strings_copy_kernel.cu" + "strings/gpu/strings_lower_upper_kernel.cu") +endif() + if(APPLE OR WIN32) list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu") endif() @@ -117,7 +196,9 @@ file( "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc" "sparse/xpu/*.cc") -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) collect_srcs(kernels_srcs SRCS ${kernel_cu}) kernel_declare("${kernel_cu}") endif() diff --git a/paddle/phi/kernels/activation_kernel.cc b/paddle/phi/kernels/activation_kernel.cc index f157c5e054bfb..9dffd348ec62b 100644 --- a/paddle/phi/kernels/activation_kernel.cc +++ b/paddle/phi/kernels/activation_kernel.cc @@ -32,7 +32,8 @@ using complex128 = ::phi::dtype::complex; PD_REGISTER_KERNEL(relu6, CPU, ALL_LAYOUT, phi::Relu6Kernel, float, double) {} 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(relu6, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index db30ec7389619..73fc6b4100cb4 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -135,7 +135,8 @@ PD_REGISTER_KERNEL(assign_value, int8_t, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 87eca2613a7b5..3817e62791c47 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -23,6 +23,9 @@ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif namespace phi { @@ -32,6 +35,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventCreate(&start_); hipEventCreate(&stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreate(&start_); + musaEventCreate(&stop_); #else cudaEventCreate(&start_); cudaEventCreate(&stop_); @@ -46,6 +52,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventDestroy(start_); hipEventDestroy(stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventDestroy(start_); + musaEventDestroy(stop_); #else cudaEventDestroy(start_); cudaEventDestroy(stop_); @@ -55,6 +64,8 @@ class GpuTimer { void Start(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(start_, stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(start_, stream); #else cudaEventRecord(start_, stream); #endif @@ -63,6 +74,8 @@ class GpuTimer { void Stop(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(stop_, stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(stop_, stream); #else cudaEventRecord(stop_, stream); #endif @@ -73,6 +86,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventSynchronize(stop_); hipEventElapsedTime(&milliseconds, start_, stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventSynchronize(stop_); + musaEventElapsedTime(&milliseconds, start_, stop_); #else cudaEventSynchronize(stop_); cudaEventElapsedTime(&milliseconds, start_, stop_); diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc index 6e496a355302f..661b287071fc5 100644 --- a/paddle/phi/kernels/check_memory_continue_kernel.cc +++ b/paddle/phi/kernels/check_memory_continue_kernel.cc @@ -88,7 +88,8 @@ PD_REGISTER_KERNEL(check_memory_continue, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(check_memory_continue, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 8dcd3a1d995d8..58cacd21bba18 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -292,7 +292,7 @@ PD_REGISTER_KERNEL(coalesce_tensor, } #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(coalesce_tensor, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc index 65ee3c1851003..81ed7170d7a24 100644 --- a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc 
+++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -64,7 +64,7 @@ struct GeluGradFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) auto x_data = x.data(); auto dx_data = dx.data(); auto dout_data = dout.data(); diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc index dbab3bd326664..47ab1a7839066 100644 --- a/paddle/phi/kernels/cpu/gelu_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -53,7 +53,7 @@ struct GeluFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) auto x_data = x.data(); auto out_data = out.data(); int n = std::min(x.size(), out.size()); diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index 17c24fa905b5c..638efeb4e3257 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -97,7 +97,8 @@ void DistGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL( dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} #endif diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 8df5e9a543eb2..3bc8ad34ac951 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -74,7 +74,8 @@ PD_REGISTER_KERNEL(empty_like, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 42d137ba4f419..476cfc810acf8 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -46,7 +46,8 @@ PD_REGISTER_KERNEL(flatten_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index dc61e6a650efa..0c6c9b3ec2d9a 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -75,7 +75,8 @@ PD_REGISTER_KERNEL(flatten, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(flatten_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc index 38beafbfa51b9..8817d577f7c8d 100644 --- a/paddle/phi/kernels/full_kernel.cc +++ b/paddle/phi/kernels/full_kernel.cc @@ -49,7 +49,8 @@ PD_REGISTER_KERNEL(full_batch_size_like, bool) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + 
defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(full_batch_size_like, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 999625cf3dfb4..3a2b1f276bbbb 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -8,11 +8,20 @@ file( GLOB func_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) file( GLOB func_cu_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") endif() +# TODO(@MTAI): compilation error will occur when compiling the following files. +# Compiler mcc need fix this bug. +if(WITH_MUSA) + list(REMOVE_ITEM func_cu_srcs "cross_entropy.cu" "gru_compute.cu" + "softmax.cu") +endif() + collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs}) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 203f6837d4611..a43300056161b 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -2566,7 +2566,8 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__) template struct CudaLogitFunctor : public BaseActivationFunctor { diff --git a/paddle/phi/kernels/funcs/algorithm.h b/paddle/phi/kernels/funcs/algorithm.h index 5f66f6f1abd4d..49daa32412674 100644 --- a/paddle/phi/kernels/funcs/algorithm.h +++ b/paddle/phi/kernels/funcs/algorithm.h @@ -40,7 +40,8 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { template HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group LowerBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || \ + defined(__MUSACC__) // @{ Group LowerBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/lower_bound auto *first = x; @@ -63,7 +64,8 @@ HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { template HOSTDEVICE inline size_t UpperBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group UpperBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || \ + defined(__MUSACC__) // @{ Group UpperBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/upper_bound auto *first = x; diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 140eca890480f..b1e492d65b4a1 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -175,7 +175,8 @@ class Blas { T* c, const int* ldc) const; -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) template void MatMulWithHead(const phi::DenseTensor& mat_a, const MatDescriptor& dim_a, @@ -303,7 +304,7 @@ class Blas { int batchCount) const; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) template void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, @@ -360,7 +361,8 @@ class Blas { T* B, int ldb) const; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const; @@ -445,7 +447,8 @@ class BlasT : private Blas { Base()->template CSRMM(args...); } -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) template void MatMulWithHead(ARGS... args) const { Base()->template MatMulWithHead(args...); @@ -543,7 +546,8 @@ class BlasT : private Blas { Base()->template TRSM(args...); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template void BatchedGETRF(ARGS... args) const { Base()->template BatchedGETRF(args...); @@ -593,3 +597,6 @@ inline BlasT GetBlas(const DeviceContext& dev_ctx) { #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/kernels/funcs/blas/blas_impl.mu.h" +#endif diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index ffafe15b8fcf2..f570a48eeb5b7 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1452,7 +1452,8 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead + !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: BatchedGEMMWithHead template <> template void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, @@ -1698,7 +1699,7 @@ void Blas::MatMul(const T *mat_a, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: MatMulWithHead /* * Multiple two matrixes with multiple heads diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.mu.h b/paddle/phi/kernels/funcs/blas/blas_impl.mu.h new file mode 100644 index 0000000000000..44098d9d090c6 --- /dev/null +++ b/paddle/phi/kernels/funcs/blas/blas_impl.mu.h @@ -0,0 +1,357 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
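+
+// NOTE: Every Blas<phi::GPUContext> specialization in this header is currently
+// an empty stub ({}). The declarations appear to mirror the CUDA/HIP Blas
+// interfaces so that MUSA builds compile and link, but no MUSA BLAS calls are
+// made yet, so code paths that reach these functions perform no computation.
+// (Presumably they will be backed by the MUSA BLAS library in a follow-up
+// change.)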
+ +#pragma once + +#if defined(__MUSACC__) +#include +#endif +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/flags.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +PHI_DECLARE_bool(enable_cublas_tensor_op_math); +PHI_DECLARE_bool(gemm_use_half_precision_compute_type); + +namespace phi { +namespace funcs { + +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T *A, + const T *B, + T beta, + T *C) const {} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::float16 alpha, + const phi::dtype::float16 *A, + const phi::dtype::float16 *B, + phi::dtype::float16 beta, + phi::dtype::float16 *C) const {} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C) const {} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::complex alpha, + const phi::dtype::complex *A, + const phi::dtype::complex *B, + phi::dtype::complex beta, + phi::dtype::complex *C) const {} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::complex alpha, + const phi::dtype::complex *A, + const phi::dtype::complex *B, + phi::dtype::complex beta, + phi::dtype::complex *C) const {} + +template <> +template +void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + T alpha, + const T *A, + int lda, + const T *B, + int ldb, + T beta, + T *C, + int ldc) const {} + +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::float16 alpha, + const phi::dtype::float16 *A, + int lda, + const phi::dtype::float16 *B, + int ldb, + phi::dtype::float16 beta, + phi::dtype::float16 *C, + int ldc) const {} + +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const {} + +template <> +template +void Blas::AXPY(int n, T alpha, const T *x, T *y) const {} + +template <> +template +void Blas::SCAL(int n, const T alpha, T *x) const {} + +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const {} + +template <> +template +void Blas::GEMV(bool trans_a, + int M, + int N, + T alpha, + const T *A, + const T *B, + T beta, + T *C) const {} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, + int M, + int N, + phi::dtype::float16 alpha, + const phi::dtype::float16 *A, + const phi::dtype::float16 *B, + phi::dtype::float16 beta, + phi::dtype::float16 *C) const {} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, + int M, + int N, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C) const {} + +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T *A, + const T 
*B, + T beta, + T *C, + int batchCount, + int64_t strideA, + int64_t strideB) const {} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int batchCount, + int64_t strideA, + int64_t strideB) const {} + +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T **A, + const T **B, + T beta, + T **C, + int batchCount) const {} + +#if defined(__MUSACC__) +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + double alpha, + const double **A, + const double **B, + double beta, + double **C, + int batchCount) const {} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const float **A, + const float **B, + float beta, + float **C, + int batchCount) const {} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::float16 alpha, + const phi::dtype::float16 **A, + const phi::dtype::float16 **B, + phi::dtype::float16 beta, + phi::dtype::float16 **C, + int batchCount) const {} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 **A, + const phi::dtype::bfloat16 **B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 **C, + int batchCount) const {} +#endif + +template <> +template +void Blas::TRSM(CBLAS_SIDE side, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, + int M, + int N, + T alpha, + const T *A, + int lda, + T *B, + int ldb) const {} + +template <> +template +void Blas::BatchedGETRF( + int n, T **a, int *ipiv, int *info, int batch_size) const {} + +template <> +template +void Blas::BatchedGETRI(int n, + const T **a, + const int *ipiv, + T **a_inv, + int *info, + int batch_size) const {} + +template <> +template +void Blas::BatchedMatInv( + int n, const T **a, T **a_inv, int *info, int batch_size) const {} + +template <> +template +void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, + int n, + int nrhs, + const T **a, + int lda, + int *ipiv, + T **b, + int ldb, + int *info, + int batch_size) const {} + +template <> +template +void Blas::BatchedTRSM(CBLAS_SIDE side, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, + int M, + int N, + T alpha, + const T **A, + int lda, + T **B, + int ldb, + int batch_size) const {} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index e754ce3bf49e4..b1732b44373c7 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -17,7 +17,8 @@ limitations under the License. 
*/ #include #include "paddle/phi/kernels/funcs/elementwise_base.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__) #include "paddle/phi/kernels/funcs/dims_simplifier.h" namespace kps = phi::kps; @@ -27,7 +28,8 @@ namespace kps = phi::kps; namespace phi { namespace funcs { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__) enum BroadcastLoadType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; diff --git a/paddle/phi/kernels/funcs/check_numerics_utils.h b/paddle/phi/kernels/funcs/check_numerics_utils.h index 473d7994058a8..7f618fa3b3f33 100644 --- a/paddle/phi/kernels/funcs/check_numerics_utils.h +++ b/paddle/phi/kernels/funcs/check_numerics_utils.h @@ -86,7 +86,7 @@ HOSTDEVICE static void PrintAndThrowError(const char* debug_info, int64_t num_nan, int64_t num_inf, int64_t num_zero) { -#if !defined(__HIPCC__) && !defined(__CUDA_ARCH__) +#if !defined(__HIPCC__) && !defined(__CUDA_ARCH__) && !defined(__MUSA_ARCH__) PADDLE_THROW(phi::errors::PreconditionNotMet( "There are NAN or INF (num_nan=%lld, num_inf=%lld, num_zero=%lld) in " "%s.", diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 5a7574b56a891..3086d5dc4ed14 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -21,6 +21,10 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/kernels/funcs/segmented_array.h" +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/gpu/musa/musa_helper.h" +#endif + namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/detail/strided_memcpy.h b/paddle/phi/kernels/funcs/detail/strided_memcpy.h index 0cd07fdfd0e1a..707b203e9f49b 100644 --- a/paddle/phi/kernels/funcs/detail/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/detail/strided_memcpy.h @@ -17,7 +17,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/device_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_context.h" #endif @@ -41,7 +42,8 @@ struct StridedMemcpyFunctor { auto& cpu_place = place; memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy( @@ -68,7 +70,8 @@ struct StridedMemcpyFunctor { memory_utils::Copy( cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy(gpu_place, diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index a30fb79f8c8b0..f0235f0baec5f 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -14,7 +14,7 @@ #pragma once -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include @@ -109,7 +109,7 @@ DenseTensor Diagonal(const DeviceContext& context, int64_t pos = std::abs(offset) * offset_stride; int64_t dim_size = ret_strides.size(); -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) thrust::device_vector diag_vec(vectorize(dig_stride)); const int64_t* diag_arr = thrust::raw_pointer_cast(diag_vec.data()); thrust::device_vector ret_vec(ret_strides); @@ -146,7 +146,7 @@ std::vector ComputeDimStride(const std::vector dim) { return dim_strides; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template __global__ void DiagonalCuda(const T* data1, T* data2, diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index abade7ac0ef87..4705370f71f7c 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -17,6 +17,9 @@ limitations under the License. */ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include #endif @@ -28,7 +31,7 @@ limitations under the License. */ #include "paddle/phi/core/generator.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif @@ -49,7 +52,7 @@ struct exponential_transform { explicit exponential_transform(T lambda) : lambda_(lambda) {} HOSTDEVICE inline T operator()(T val) const { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) T log = -std::numeric_limits::epsilon() / 2; if (val < static_cast(1.) 
- std::numeric_limits::epsilon() / 2) { if (std::is_same::value) { @@ -113,7 +116,7 @@ struct normal_transform { T std_; }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) namespace kps = phi::kps; @@ -186,6 +189,69 @@ struct normal_distribution { static constexpr int kReturnsCount = 2; }; +#elif defined(__MUSACC__) +template +struct uniform_distribution { + __device__ inline T operator()(murand_state_philox4x32_10 *state) const { + return static_cast(murand_uniform(state)); + } + static constexpr int kReturnsCount = 1; +}; + +template <> +struct uniform_distribution { + __device__ inline float4 operator()(murand_state_philox4x32_10 *state) const { + return murand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + murand_state_philox4x32_10 *state) const { + return murand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct uniform_distribution { + __device__ inline uint4 operator()(murand_state_philox4x32_10 *state) const { + return murand4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline ulonglong2 operator()( + murand_state_philox4x32_10 *state) const { + ulonglong2 result; + uint4 rand = murand4(state); + result.x = (uint64_t)rand.x << 32 | rand.y; + result.y = (uint64_t)rand.z << 32 | rand.w; + return result; + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()(murand_state_philox4x32_10 *state) const { + return murand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + murand_state_philox4x32_10 *state) const { + return murand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; #else template struct uniform_distribution { @@ -268,6 +334,10 @@ __global__ void DistributionKernel(size_t size, curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, offset, &state); using SType = curandStatePhilox4_32_10_t; +#elif defined(__MUSACC__) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = murand_state_philox4x32_10; #else hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, offset, &state); diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index a1fc2c225ecf2..d31ab7f3c1c12 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -24,6 +24,10 @@ limitations under the License. 
*/ #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #include "paddle/phi/kernels/funcs/dropout_impl_util.h" @@ -142,6 +146,10 @@ __global__ void VectorizedRandomGenerator(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murand_state_philox4x32_10; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -212,6 +220,10 @@ __global__ void VectorizedGeneratorMask(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murand_state_philox4x32_10; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -295,6 +307,11 @@ void DropoutFwGPUKernelDriver( hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); @@ -430,6 +447,8 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, if (upscale_in_train && dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP hipMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #else cudaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #endif diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 274ac1cc32c05..08d59cc2569d4 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -22,7 +22,8 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__) #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/function_traits.h" @@ -151,7 +152,7 @@ class MidWiseTransformIterator int64_t post_; }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template class RowwiseTransformIterator : public thrust::iterator_adaptor, @@ -486,7 +487,8 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, } } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__) // static unroller template