diff --git a/.gitmodules b/.gitmodules index 3ea3f4a0903a0..eb37796f51f0a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -52,7 +52,8 @@ ignore = dirty [submodule "third_party/eigen3"] path = third_party/eigen3 - url = https://gitlab.com/libeigen/eigen.git + url = https://gitlab.com/paipinuo233/eigen.git + branch = support_musa ignore = dirty [submodule "third_party/snappy"] path = third_party/snappy diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d66288ac48580..2855a0dbe674f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -97,7 +97,7 @@ repos: files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$ args: - --extensions=c,cc,cxx,cpp,cu,cuh,h,hpp,hxx,kps - - --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens + - --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens,-whitespace/braces,-build/include - --quiet # Exclude third-party libraries exclude: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 632cf33100c7e..dfbe22ea13911 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF) option(WITH_XPU_PLUGIN "Compile PaddlePaddle with BAIDU XPU plugin" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_MUSA "Compile PaddlePaddle with MUSA platform" OFF) option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF) @@ -89,6 +90,9 @@ endif() if(WITH_GPU AND WITH_ROCM) message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() +if(WITH_GPU AND WITH_MUSA) + message(FATAL_ERROR "Error when compile CUDA and MUSA at the same time") +endif() if(WITH_GPU AND NOT APPLE) enable_language(CUDA) @@ -346,6 +350,7 @@ if(LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT WITH_GPU AND NOT WITH_ROCM + AND NOT WITH_MUSA AND NOT WITH_XPU AND NOT WITH_XPU_KP AND NOT WITH_XPU_XFT @@ -503,6 +508,31 @@ else() endif() endif() +if(WITH_MUSA) + include(musa) + include(mudnn) +endif() + +if(NOT WITH_MUSA AND WITH_MCCL) + message( + WARNING "Disable MCCL when compiling without MUSA. Force WITH_MCCL=OFF.") + set(WITH_MCCL + OFF + CACHE STRING "Disable MCCL when compiling without MUSA" FORCE) +endif() + +if(WITH_MCCL) + add_definitions("-DPADDLE_WITH_MCCL") + include(mccl) +else() + if(WITH_MUSA) + message( + WARNING + "If the environment is multi-card, the WITH_MCCL option needs to be turned on, otherwise only a single card can be used." 
+ ) + endif() +endif() + if(WITH_HETERPS AND WITH_PSLIB) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index dc661fce388fe..35e78b01b9bbe 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -175,6 +175,13 @@ elseif(WITH_ROCM) if(${MIOPEN_VERSION} VERSION_LESS 2090) message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile") endif() +elseif(WITH_MUSA) + add_definitions(-DPADDLE_WITH_MUSA) + add_definitions(-DEIGEN_USE_GPU) + add_definitions(-DEIGEN_USE_MUSA) + if(NOT MUDNN_FOUND) + message(FATAL_ERROR "Paddle needs MUDNN to compile") + endif() else() add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 06e37b3c8a602..a981007ba5aa5 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -17,7 +17,7 @@ include(ExternalProject) # update eigen to the commit id f612df27 on 03/16/2021 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3) set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3) -set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee) +set(EIGEN_TAG 6ad1f10acbc311dd82b20cce7f5c305ae8c3eaa9) set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/eigen3) if(WIN32) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 947d44950d52b..56c9c0de2f24b 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -425,6 +425,9 @@ function(cc_binary TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() + if(WITH_MUSA) + target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB}) + endif() check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS}) @@ -452,6 +455,9 @@ function(cc_test_build TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() + if(WITH_MUSA) + target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB}) + endif() check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS}) endif() endfunction() @@ -775,6 +781,111 @@ function(hip_test TARGET_NAME) endif() endfunction() +function(musa_library TARGET_NAME) + if(WITH_MUSA) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + if(musa_library_SRCS) + if(musa_library_SHARED OR musa_library_shared) # build *.so + musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) + else() + musa_add_library(${TARGET_NAME} STATIC ${musa_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) + endif() + if(musa_library_DEPS) + add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${musa_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND musa_library_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else() + if(musa_library_DEPS) + list(REMOVE_DUPLICATES musa_library_DEPS) + generate_dummy_static_lib( + LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR + "generic.cmake:musa_library") + + target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) + add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) + else() + message(FATAL "Please specify source file or library in musa_library.") + endif() + endif() + endif() +endfunction() + +function(musa_binary 
TARGET_NAME) + if(WITH_MUSA) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_binary "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${musa_binary_SRCS}) + if(musa_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${musa_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${musa_binary_DEPS}) + common_link(${TARGET_NAME}) + endif() + endif() +endfunction() + +function(musa_test TARGET_NAME) + if(WITH_MUSA AND WITH_TESTING) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(musa_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + musa_add_executable(${TARGET_NAME} ${musa_test_SRCS}) + # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE + target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries( + ${TARGET_NAME} + ${musa_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + glog + phi + ${os_dependency_modules}) + add_dependencies( + ${TARGET_NAME} + ${musa_test_DEPS} + paddle_gtest_main + lod_tensor + memory + gtest + phi + glog) + common_link(${TARGET_NAME}) + add_test(${TARGET_NAME} ${TARGET_NAME}) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT + FLAGS_cudnn_deterministic=true) + set_property( + TEST ${TARGET_NAME} + PROPERTY + ENVIRONMENT + "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH" + ) + endif() +endfunction() + function(xpu_library TARGET_NAME) if(WITH_XPU_KP) set(options STATIC static SHARED shared) @@ -1274,6 +1385,15 @@ function(math_library TARGET) ${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + elseif(WITH_MUSA) + musa_library( + ${TARGET} + SRCS + ${cc_srcs} + ${cu_srcs} + DEPS + ${math_library_DEPS} + ${math_common_deps}) elseif(${cc_srcs_len} GREATER 0) cc_library( ${TARGET} diff --git a/cmake/mccl.cmake b/cmake/mccl.cmake new file mode 100644 index 0000000000000..fd32500458161 --- /dev/null +++ b/cmake/mccl.cmake @@ -0,0 +1,52 @@ +if(NOT WITH_MUSA) + return() +endif() + +# Now we don't support MCCL on windows +if(WIN32) + return() +endif() + +if(WITH_MCCL) + set(MCCL_ROOT + "/usr/local/musa/" + CACHE PATH "MCCL ROOT") + find_path( + MCCL_INCLUDE_DIR mccl.h + PATHS ${MCCL_ROOT} ${MCCL_ROOT}/include ${MCCL_ROOT}/local/include + $ENV{MCCL_ROOT} $ENV{MCCL_ROOT}/include $ENV{MCCL_ROOT}/local/include + NO_DEFAULT_PATH) + + if(MCCL_INCLUDE_DIR) + file(READ ${MCCL_INCLUDE_DIR}/mccl.h MCCL_VERSION_FILE_CONTENTS) + + string(REGEX MATCH "define MCCL_MAJOR +([0-9]+)" MCCL_MAJOR_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MCCL_MAJOR +([0-9]+)" "\\1" MCCL_MAJOR_VERSION + "${MCCL_MAJOR_VERSION}") + string(REGEX MATCH "define MCCL_MINOR +([0-9]+)" MCCL_MINOR_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MCCL_MINOR +([0-9]+)" "\\1" MCCL_MINOR_VERSION + "${MCCL_MINOR_VERSION}") + string(REGEX MATCH "define MCCL_PATCH +([0-9]+)" MCCL_PATCH_VERSION + "${MCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MCCL_PATCH +([0-9]+)" "\\1" MCCL_PATCH_VERSION + "${MCCL_PATCH_VERSION}") + if(NOT MCCL_MAJOR_VERSION) + set(MCCL_VERSION "???") + else() + math(EXPR MCCL_VERSION "${MCCL_MAJOR_VERSION} * 1000 + + 
${MCCL_MINOR_VERSION} * 100 + ${MCCL_PATCH_VERSION}") + endif() + add_definitions("-DMCCL_VERSION_CODE=$MCCL_VERSION") + include_directories(${MCCL_INCLUDE_DIR}) + + message(STATUS "Current MCCL header is ${MCCL_INCLUDE_DIR}/mccl.h. ") + message( + STATUS + "Current MCCL version is " + "v${MCCL_MAJOR_VERSION}.${MCCL_MINOR_VERSION}.${MCCL_PATCH_VERSION} ") + else() + message(FATAL_ERROR "WITH_MCCL is enabled but mccl.h file is not found!") + endif() +endif() diff --git a/cmake/mudnn.cmake b/cmake/mudnn.cmake new file mode 100644 index 0000000000000..81027890d144e --- /dev/null +++ b/cmake/mudnn.cmake @@ -0,0 +1,92 @@ +if(NOT WITH_MUSA) + return() +endif() + +if(WIN32) + return() +else() + set(MUDNN_ROOT + "/usr/local/musa" + CACHE PATH "MUDNN ROOT") +endif() + +find_path( + MUDNN_INCLUDE_DIR mudnn.h + PATHS ${MUDNN_ROOT} ${MUDNN_ROOT}/include $ENV{MUDNN_ROOT} + $ENV{MUDNN_ROOT}/include ${MUSA_TOOLKIT_INCLUDE} + NO_DEFAULT_PATH) + +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + +list( + APPEND + MUDNN_CHECK_LIBRARY_DIRS + ${MUDNN_ROOT} + ${MUDNN_ROOT}/lib64 + ${MUDNN_ROOT}/lib + ${MUDNN_ROOT}/lib/x64 + ${MUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + $ENV{MUDNN_ROOT} + $ENV{MUDNN_ROOT}/lib64 + $ENV{MUDNN_ROOT}/lib + $ENV{MUDNN_ROOT}/lib/x64 + /usr/lib + ${MUSA_TOOLKIT_ROOT_DIR} + ${MUSA_TOOLKIT_ROOT_DIR}/lib/x64) +set(MUDNN_LIB_NAME "") + +if(LINUX) + set(MUDNN_LIB_NAME "libmudnn.so") +endif() + +find_library( + MUDNN_LIBRARY + NAMES ${MUDNN_LIB_NAME} + PATHS ${MUDNN_CHECK_LIBRARY_DIRS} ${MUDNN_INCLUDE_DIR} + NO_DEFAULT_PATH + DOC "Path to muDNN library.") + +if(MUDNN_INCLUDE_DIR AND MUDNN_LIBRARY) + set(MUDNN_FOUND ON) +else() + set(MUDNN_FOUND OFF) +endif() + +macro(find_mudnn_version mudnn_version_file) + file(READ ${mudnn_version_file} MUDNN_VERSION_FILE_CONTENTS) + get_filename_component(MUDNN_LIB_PATH ${MUDNN_LIBRARY} DIRECTORY) + + string(REGEX MATCH "define MUDNN_VERSION_MAJOR +([0-9]+)" MUDNN_MAJOR_VERSION + "${MUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MUDNN_VERSION_MAJOR +([0-9]+)" "\\1" + MUDNN_MAJOR_VERSION "${MUDNN_MAJOR_VERSION}") + string(REGEX MATCH "define MUDNN_VERSION_MINOR +([0-9]+)" MUDNN_MINOR_VERSION + "${MUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MUDNN_VERSION_MINOR +([0-9]+)" "\\1" + MUDNN_MINOR_VERSION "${MUDNN_MINOR_VERSION}") + string(REGEX MATCH "define MUDNN_VERSION_PATCH +([0-9]+)" MUDNN_PATCH_VERSION + "${MUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MUDNN_VERSION_PATCH +([0-9]+)" "\\1" + MUDNN_PATCH_VERSION "${MUDNN_PATCH_VERSION}") + + if(NOT MUDNN_MAJOR_VERSION) + set(MUDNN_VERSION "???") + else() + add_definitions("-DMUDNN_MAJOR_VERSION=\"${MUDNN_MAJOR_VERSION}\"") + math(EXPR MUDNN_VERSION "${MUDNN_MAJOR_VERSION} * 1000 + + ${MUDNN_MINOR_VERSION} * 100 + ${MUDNN_PATCH_VERSION}") + message(STATUS "Current muDNN version file is ${mudnn_version_file} ") + message( + STATUS + "Current muDNN version is v${MUDNN_MAJOR_VERSION}.${MUDNN_MINOR_VERSION}.${MUDNN_PATCH_VERSION}. 
" + ) + endif() +endmacro() + +if(MUDNN_FOUND) + find_mudnn_version(${MUDNN_INCLUDE_DIR}/mudnn_version.h) + include_directories(${MUDNN_INCLUDE_DIR}) +endif() diff --git a/cmake/musa.cmake b/cmake/musa.cmake new file mode 100644 index 0000000000000..fa1268fbce02b --- /dev/null +++ b/cmake/musa.cmake @@ -0,0 +1,123 @@ +if(NOT WITH_MUSA) + return() +endif() + +if(NOT DEFINED ENV{MUSA_PATH}) + set(MUSA_PATH + "/usr/local/musa" + CACHE PATH "Path to which ROCm has been installed") +else() + set(MUSA_PATH + $ENV{MUSA_PATH} + CACHE PATH "Path to which ROCm has been installed") +endif() +set(CMAKE_MODULE_PATH "${MUSA_PATH}/cmake" ${CMAKE_MODULE_PATH}) + +find_package(MUSA REQUIRED) +include_directories(${MUSA_PATH}/include) + +# set openmp include directory +set(llvm_openmp_search_list) +foreach(item RANGE 6 20 1) + list(APPEND llvm_openmp_search_list /usr/lib/llvm-${item}/include/openmp/) +endforeach() + +find_path( + OPENMP_INCLUDE_DIR omp.h + PATHS ${llvm_openmp_search_list} REQUIRED + NO_DEFAULT_PATH) +include_directories(${OPENMP_INCLUDE_DIR}) + +macro(find_musa_version musa_version_file) + set(python_file ${PROJECT_BINARY_DIR}/get_version.py) + set(MUSA_VERSION + "None" + CACHE STRING "musa version" FORCE) + file( + WRITE ${python_file} + "" + "import json\n" + "import sys\n" + "with open(sys.argv[1], 'r') as f:\n" + " data = json.load(f)\n" + " print(data[\"MUSA_RUNTIME\"][\"version\"])" + "") + + execute_process( + COMMAND "python" "${python_file}" ${musa_version_file} + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE python_res + OUTPUT_VARIABLE python_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(python_res EQUAL 0) + set(MUSA_VERSION ${python_out}) + endif() + string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\1" MUSA_MAJOR_VERSION + "${MUSA_VERSION}") + string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\2" MUSA_MINOR_VERSION + "${MUSA_VERSION}") + string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\3" MUSA_PATCH_VERSION + "${MUSA_VERSION}") + + if(NOT MUSA_MAJOR_VERSION) + set(MUSA_VERSION "???") + message(WARNING "Cannot find MUSA version in ${MUSA_PATH}/version.json") + else() + math( + EXPR + MUSA_VERSION + "${MUSA_MAJOR_VERSION} * 10000 + ${MUSA_MINOR_VERSION} * 100 + ${MUSA_PATCH_VERSION}" + ) + message(STATUS "Current MUSA version file is ${MUSA_PATH}/version.json.") + message( + STATUS + "Current MUSA version is v${MUSA_MAJOR_VERSION}.${MUSA_MINOR_VERSION}.${MUSA_PATCH_VERSION} " + ) + endif() +endmacro() +find_musa_version(${MUSA_PATH}/version.json) + +list(APPEND MUSA_MCC_FLAGS -Wno-macro-redefined) +list(APPEND MUSA_MCC_FLAGS -Wno-deprecated-copy-with-user-provided-copy) +list(APPEND MUSA_MCC_FLAGS -Wno-pragma-once-outside-header) +list(APPEND MUSA_MCC_FLAGS -Wno-return-type) +list(APPEND MUSA_MCC_FLAGS -Wno-sign-compare) +list(APPEND MUSA_MCC_FLAGS -Wno-mismatched-tags) +list(APPEND MUSA_MCC_FLAGS -Wno-pessimizing-move) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-but-set-variable) +list(APPEND MUSA_MCC_FLAGS -Wno-bitwise-instead-of-logical) +list(APPEND MUSA_MCC_FLAGS -Wno-format) +list(APPEND MUSA_MCC_FLAGS -Wno-self-assign) +list(APPEND MUSA_MCC_FLAGS -Wno-literal-conversion) +list(APPEND MUSA_MCC_FLAGS -Wno-unknown-warning-option) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-variable) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-value) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-local-typedef) +list(APPEND MUSA_MCC_FLAGS -Wno-unused-lambda-capture) +list(APPEND MUSA_MCC_FLAGS -Wno-reorder-ctor) +list(APPEND MUSA_MCC_FLAGS 
-Wno-braced-scalar-init) +list(APPEND MUSA_MCC_FLAGS -Wno-pass-failed) +list(APPEND MUSA_MCC_FLAGS -Wno-missing-braces) +list(APPEND MUSA_MCC_FLAGS -Wno-dangling-gsl) + +if(WITH_CINN) + list(APPEND MUSA_MCC_FLAGS -std=c++14) +else() + list(APPEND MUSA_MCC_FLAGS -std=c++17) +endif() + +list(APPEND MUSA_MCC_FLAGS --cuda-gpu-arch=mp_21) +list(APPEND MUSA_MCC_FLAGS -U__CUDA__) +# MUSA has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer +list(APPEND MUSA_MCC_FLAGS -D__MUSA_NO_HALF_CONVERSIONS__) + +#set(MUSA_VERBOSE_BUILD ON) +if(CMAKE_BUILD_TYPE MATCHES Debug) + list(APPEND MUSA_MCC_FLAGS -g2) + list(APPEND MUSA_MCC_FLAGS -O0) +endif() + +set(musa_runtime_library_name musart) +find_library(MUSARTC_LIB ${musa_runtime_library_name} HINTS ${MUSA_PATH}/lib) +message(STATUS "MUSARTC_LIB: ${MUSARTC_LIB}") diff --git a/cmake/phi.cmake b/cmake/phi.cmake index 3c234c6b93326..c160c2834abbd 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -103,8 +103,8 @@ function(kernel_declare TARGET_LIST) set(first_registry "") endif() endif() - # some gpu kernel only can run on cuda, not support rocm, so we add this branch - if(WITH_ROCM) + # some gpu kernel only can run on cuda, not support rocm and musa, so we add this branch + if(WITH_ROCM OR WITH_MUSA) string(FIND "${first_registry}" "cuda_only" pos) if(pos GREATER 1) set(first_registry "") diff --git a/cmake/version.cmake b/cmake/version.cmake index e6707665a3851..6b1905352bbad 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -86,12 +86,19 @@ function(version version_file) "WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_GPU: ${WITH_GPU}\n" "WITH_ROCM: ${WITH_ROCM}\n" + "WITH_MUSA: ${WITH_MUSA}\n" "WITH_IPU: ${WITH_IPU}\n") if(WITH_GPU) file(APPEND ${version_file} "CUDA version: ${CUDA_VERSION}\n" "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n") endif() + if(WITH_MUSA) + file( + APPEND ${version_file} + "MUSA version: v${MUSA_MAJOR_VERSION}.${MUSA_MINOR_VERSION}.${MUSA_PATCH_VERSION}\n" + "MUDNN version: v${MUDNN_MAJOR_VERSION}.${MUDNN_MINOR_VERSION}\n") + endif() if(WITH_ROCM) file(APPEND ${version_file} "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n" diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 82d99a3835230..06c27f1d205c1 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -272,7 +272,8 @@ static std::shared_ptr GetGC( int64_t max_memory_size = framework::GetEagerDeletionThreshold(); std::shared_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { if (framework::IsFastEagerDeletionModeEnabled()) { gc.reset(new framework::UnsafeFastGPUGarbageCollector(place, diff --git a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc index 2e3389af5feb5..02955f46018f6 100644 --- a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc @@ -71,7 +71,8 @@ bool CondInterceptor::GetCondResult() { const auto& cond_tensor = cond_var->Get(); bool res = false; if (platform::is_gpu_place(cond_tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + 
defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; framework::TensorCopy(cond_tensor, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(cond_tensor.place())->Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 4836d656d180f..4b59290c2b87a 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -76,7 +76,8 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_data.data.length()); } else if (platform::is_gpu_place(place)) { VLOG(3) << "Loading data for GPU."; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = dynamic_cast(pool.Get(place)); auto gpu_place = place; diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 7567236c4ff68..d3c0df2a11595 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -132,7 +132,8 @@ void ScaleAPI(const paddle::Tensor& x, bias_after_scale, dense_out.get()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (expected_kernel_place == paddle::platform::CUDAPlace()) { auto* dev_ctx = dynamic_cast(pool.Get(expected_kernel_place)); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 7fe53febc5a9b..b96b997976be4 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -124,7 +124,7 @@ def FindParsingFunctionFromAttributeType(atype): FUNCTION_SET_DEVICE_TEMPLATE = """{} SetPythonStack(); if (paddle::platform::is_gpu_place(place)) {{ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(4) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; #else diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index a66bc211d513c..03d0bfbf5ed23 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -98,7 +98,8 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { auto& place = dense_tensor->place(); if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); #else diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 1620c99ce8560..aca4ce5f23d8c 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -45,6 +45,8 @@ class ConvSearchCache { AlgorithmsCache* GetConvFusion() { return &fusion_forward_cache_; } +#elif defined(PADDLE_WITH_MUSA) + #else 
AlgorithmsCache* GetForward() { return &forward_cache_; @@ -72,6 +74,8 @@ class ConvSearchCache { AlgorithmsCache backward_data_cache_; AlgorithmsCache backward_filter_cache_; AlgorithmsCache fusion_forward_cache_; +#elif defined(PADDLE_WITH_MUSA) + #else AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index 10e0b76f00459..b2f389bb965a0 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -32,7 +32,8 @@ namespace framework { static std::vector CreatePlaceList() { std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPlace(0)); #endif return places; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index ebfed9a6f73f6..44cfbf77ea6c2 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -123,7 +123,8 @@ static void RunKernelFunc( "Input tensor (%s) is not initialized.", in_name)); paddle::Tensor custom_in; custom_in.set_impl(std::make_shared(*x)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (custom_in.is_gpu_pinned()) { VLOG(3) << "Custom Operator: custom input is gpu pinned tensor"; auto gpu_place = phi::GPUPlace(platform::GetCurrentDeviceId()); @@ -1174,7 +1175,8 @@ static void RegisterOperatorKernel( } RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CUDAPlace()); #endif diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 32c4845bd0d57..bd03b7cf4c59c 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -1526,7 +1526,9 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( #endif } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) template void PrivateInstantDataFeed::PutToFeedVec() { for (size_t i = 0; i < use_slots_.size(); ++i) { diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 1057640842c2c..b3ba9b7fd4fdd 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -1951,7 +1951,9 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { int pv_batch_size_; }; -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) template class PrivateInstantDataFeed : public DataFeed { public: diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index e058b19469000..887de75181709 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -70,7 +70,9 @@ 
REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif } // namespace framework diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 9d114fcf56396..b2fb089f53574 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -101,7 +101,7 @@ struct CastDataType { in_end, out_begin, CastDataTypeFunctor()); -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) } else if (platform::is_gpu_place(in_.place())) { phi::Transform trans; auto* context = static_cast(ctx_); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 4d9a88cf22372..9f146d960b026 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -202,6 +202,94 @@ elseif(WITH_ROCM) fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) +elseif(WITH_MUSA) + musa_library( + nan_inf_utils + SRCS + nan_inf_utils_detail.cc + DEPS + framework_proto + scope + place + phi) + musa_library( + all_reduce_op_handle + SRCS + all_reduce_op_handle.cc + DEPS + op_handle_base + scope + lod_tensor + phi + memory + dynload_cuda + variable_visitor) + musa_library( + fused_all_reduce_op_handle + SRCS + fused_all_reduce_op_handle.cc + DEPS + all_reduce_op_handle + op_handle_base + variable_visitor + scope + lod_tensor + phi + memory + dynload_cuda + place) + musa_library( + grad_merge_all_reduce_op_handle + SRCS + grad_merge_all_reduce_op_handle.cc + DEPS + fused_all_reduce_op_handle + op_handle_base + scope + lod_tensor + phi + memory + dynload_cuda + variable_visitor + place + all_reduce_op_handle) + + if(WITH_DISTRIBUTE) + musa_library( + reduce_op_handle + SRCS + reduce_op_handle.cc + DEPS + op_handle_base + variable_visitor + scope + phi + dynload_cuda) + else() + musa_library( + reduce_op_handle + SRCS + reduce_op_handle.cc + DEPS + op_handle_base + variable_visitor + scope + phi + dynload_cuda) + endif() + musa_library( + broadcast_op_handle + SRCS + broadcast_op_handle.cc + DEPS + op_handle_base + scope + phi + memory + variable_visitor + dynload_cuda) + musa_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS + broadcast_op_handle) else() cc_library( nan_inf_utils @@ -386,7 +474,9 @@ endif() if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM)) + AND (WITH_GPU + OR WITH_ROCM + OR WITH_MUSA)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() cc_library( diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 69f7a49ce55fd..73707458c073a 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -186,7 +186,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "fuse_relu_depthwise_conv_pass"); AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass"); -#if (defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif @@ -545,7 +546,8 @@ USE_PASS(fused_feedforward_pass); #ifdef PADDLE_WITH_MKLDNN USE_PASS(mkldnn_placement_pass); #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 4012263f688cb..2c4b73d73b56d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,8 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include @@ -44,7 +45,8 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( place_(place), var_infos_(vars.begin(), vars.end()), gc_(gc) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); @@ -53,6 +55,9 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -75,12 +80,15 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } EagerDeletionOpHandle::~EagerDeletionOpHandle() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (event_) { auto gpu_place = dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(gpu_place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -89,7 +97,8 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { } void EagerDeletionOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif @@ -177,7 +186,8 @@ void EagerDeletionOpHandle::RunImpl() { void EagerDeletionOpHandle::ClearGarbages( std::deque> *garbages) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (event_) { auto compute_stream = dev_ctx_->stream(); auto callback_stream = @@ -187,6 +197,10 @@ void EagerDeletionOpHandle::ClearGarbages( PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, compute_stream)); 
PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(callback_stream, event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(callback_stream, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -197,7 +211,8 @@ void EagerDeletionOpHandle::ClearGarbages( } else { #endif gc_->Add(std::move(*garbages)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif } diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 0a92269c50ad2..e08267938b822 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -80,7 +80,8 @@ class EagerDeletionOpHandle : public OpHandleBase { std::vector var_infos_; // not own GarbageCollector *gc_; // not own std::vector vars_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::GPUContext *dev_ctx_{nullptr}; gpuEvent_t event_{nullptr}; #endif diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 9fd6a08e02302..f14bb44a76cd4 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -135,7 +135,8 @@ static void TransData(const phi::DenseTensor *src_item, const platform::DeviceContext &ctx) { if (src_item->IsInitialized() && src_item->numel() > 0) { if (platform::is_gpu_place(src_item->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index a36b63da9b8b6..d522791c13875 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -121,7 +121,8 @@ static void TransData(const phi::DenseTensor &src_item, phi::DenseTensor *dst_item) { if (src_item.IsInitialized() && src_item.numel() > 0) { if (platform::is_gpu_place(src_item.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 29d5697b23f0d..f3ad442609171 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -32,7 +32,8 @@ typedef std::vector< std::vector>> GradientAndLoDTensor; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_MCCL) FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -61,11 +62,14 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( #endif FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { -#if 
defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_MCCL) auto destroy_event = [](gpuEvent_t event) { if (event == nullptr) return; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif @@ -103,6 +107,9 @@ void FusedAllReduceOpHandle::RunImpl() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(event, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(event, cudaEventDisableTiming)); @@ -126,6 +133,10 @@ void FusedAllReduceOpHandle::RunImpl() { PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(nccl_stream, start_event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(nccl_stream, start_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -185,12 +196,17 @@ void FusedAllReduceOpHandle::RunImpl() { FusedAllReduceFunc(in_var_handles, out_var_handles); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_MCCL) if (FLAGS_allreduce_record_one_event) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(compute_stream, end_event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(compute_stream, end_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 3437eb5570dc7..1f0895ae7dd30 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -47,7 +47,8 @@ struct TestGatherOpHandle { void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -224,7 +225,8 @@ TEST(GatherTester, TestCPUGatherTestSelectedRows) { test_op.TestGatherSelectedRows(input_scope_idx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(GatherTester, TestGPUGatherTestSelectedRows) { TestGatherOpHandle test_op; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 80c029a5fd976..fa3e72ab75cd1 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -183,7 +183,8 @@ void CheckVarHasNanOrInf(const std::string& op_type, << ", place:" << tensor->place() << ", numel:" 
<< tensor->numel(); if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 82f09f51c23e1..c880e6abf5b1c 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -31,11 +31,14 @@ std::string OpHandleBase::DebugString() const { } OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) for (auto &ev : events_) { if (ev.second) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -45,13 +48,17 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { } void OpHandleBase::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) for (auto &p : dev_ctxes_) { int dev_id = p.first.device; platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&events_[dev_id], musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); @@ -136,7 +143,8 @@ void OpHandleBase::InitXPU() { } void OpHandleBase::Run(DeviceType use_device) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) { InitCUDA(); } @@ -172,7 +180,8 @@ void OpHandleBase::Run(DeviceType use_device) { } void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_NOT_NULL( waited_ctx, platform::errors::InvalidArgument("Argument waited_ctx is NULL.")); @@ -188,6 +197,8 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream, ev.second, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); #endif @@ -221,12 +232,16 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto stream = static_cast(dev_ctxes_.at(place))->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#elif defined(PADDLE_WITH_MUSA) + 
PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -248,7 +263,8 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto stream = @@ -273,13 +289,17 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { auto *in_var_handle = dynamic_cast(in_var); if (in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -311,7 +331,8 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { callback(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { auto dev_id = p.first.device; @@ -320,6 +341,9 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); @@ -331,7 +355,8 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { void OpHandleBase::RunAndRecordEvent(platform::Place p, const std::function &callback) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_cpu_place(p) || events_.empty()) { callback(); } else { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 9afe56e4babd4..b9411082e2dce 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -161,7 +161,8 @@ class OpHandleBase { // See https://github.com/PaddlePaddle/Paddle/pull/32283 bool is_variant_scope_ = false; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::unordered_map events_; #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 7587fb6553cd7..bb9fbd605aeca 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -303,7 +303,8 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) { 
test_op.InitReduceOp(out_scope_idx); test_op.TestReduceLodTensors(out_scope_idx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(ReduceTester, TestGPUReduceTestSelectedRows) { TestReduceOpHandle test_op; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 9dac1a7203f8d..11490d85d183f 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -76,7 +76,8 @@ struct ScaleLossGradFunctor { "Please recompile or reinstall Paddle with XPU support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) OutT cast_coeff = static_cast(coeff_); auto stream = static_cast(ctx_)->stream(); memory::Copy(place_, @@ -110,7 +111,8 @@ void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { auto *tensor = var->GetMutable(); tensor->Resize(phi::make_ddim({1})); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) ScaleLossGradFunctor func( coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); if (record_event) { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index 02a68fb697efb..dce9d9ab621bb 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -95,7 +95,8 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype( } void ShareTensorBufferOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index a6314220d5c26..c78267882aaaf 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -129,7 +129,8 @@ struct VarHandle : public VarHandleBase { name_(std::move(name)), place_(std::move(place)) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) bool HasEvent() { return has_event_; } const gpuEvent_t& GetEvent() { @@ -154,7 +155,8 @@ struct VarHandle : public VarHandleBase { size_t scope_idx_; std::string name_; platform::Place place_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Only when this event is triggered, var is generated. 
gpuEvent_t event_; bool has_event_{false}; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 25d29e469a498..3b3a51b234de4 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -84,12 +84,13 @@ class PullDenseWorker { public: virtual ~PullDenseWorker() {} virtual void Initialize(const TrainerDesc& param); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) void AddPlace(const paddle::platform::Place place) { places_.push_back(place); } @@ -154,7 +155,8 @@ class PullDenseWorker { float total_batch_num_ = 0; std::unordered_map scope_to_thread_id_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::vector copy_streams_; #endif std::vector places_; @@ -185,7 +187,8 @@ class DeviceWorker { virtual void ProduceTasks() {} virtual void GetXpuOpIndex() {} virtual void Schedule(int taskid UNUSED) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) virtual void SetStream(const gpuStream_t stream UNUSED) {} virtual void SetEvent(const gpuEvent_t event UNUSED) {} #endif @@ -561,7 +564,8 @@ class PSGPUWorker : public HogwildWorker { new (&program_) ProgramDesc(main_program); } void ProduceTasks() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } #endif @@ -629,7 +633,8 @@ class PSGPUWorker : public HogwildWorker { std::unordered_map> feasign_set_; paddle::framework::Channel> pull_queue_; paddle::framework::Channel> push_queue_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuEvent_t event_; gpuStream_t copy_stream_; #endif @@ -802,7 +807,8 @@ class HeterSectionWorker : public DeviceWorker { Scope* GetThreadScope() override { return minibatch_scope_; } // multi-stream - // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || + // defined(PADDLE_WITH_MUSA) // void SetStream(const gpuStream_t stream) override {} // void SetEvent(const gpuEvent_t event) override {} // #endif diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 46b917cda740a..a55e640c0be32 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -96,7 +96,8 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPlace &place) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) ::DLDevice device; device.device_type = kDLGPU; device.device_id = place.device; @@ -108,7 +109,8 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPinnedPlace &place) const { -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) ::DLDevice device; device.device_type = kDLCPUPinned; device.device_id = 0; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index f6b28b0a22ebc..af3368de4dc8e 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -108,7 +108,8 @@ void TestToDLManagedTensor(const platform::Place &place, uint16_t lanes) { template void TestMainLoop() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::vector places{platform::CPUPlace(), platform::CUDAPlace(0), platform::CUDAPinnedPlace()}; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index e0ad2255743c4..b3f8525998257 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -492,7 +492,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, std::unique_ptr gc; if (!ctx->force_disable_gc_ && max_memory_size >= 0) { if (platform::is_gpu_place(place_)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); } else { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 5f46906cf8e82..e370631443d56 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -161,6 +161,11 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, values.data(), values.size() * sizeof(float*), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_values, + values.data(), + values.size() * sizeof(float*), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_values, values.data(), diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 9853c328cd14e..b3432277805a7 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -595,6 +595,9 @@ class BoxWrapper { data->resize(len); #ifdef PADDLE_WITH_HIP hipMemcpy(data->data(), gpu_data, sizeof(T) * len, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + data->data(), gpu_data, sizeof(T) * len, musaMemcpyDeviceToHost); #else cudaMemcpy( data->data(), gpu_data, sizeof(T) * len, cudaMemcpyDeviceToHost); diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index d72e418aadd3e..cba6da070ac55 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -44,7 +44,9 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; int device_id = place.GetDeviceId(); phi::DenseTensor& total_keys_tensor = 
keys_tensor[device_id]; @@ -70,6 +72,15 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy(gpu_keys, + keys.data(), + keys.size() * sizeof(uint64_t*), + musaMemcpyHostToDevice); + musaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + musaMemcpyHostToDevice); #else cudaMemcpy(gpu_keys, keys.data(), @@ -153,7 +164,9 @@ void BoxWrapper::PushSparseGradCase( PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) int device_id = place.GetDeviceId(); phi::DenseTensor& cached_total_keys_tensor = keys_tensor[device_id]; uint64_t* total_keys = diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 05433c1014656..75adf94e1ce61 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -784,7 +784,8 @@ void FleetWrapper::PushDenseVarsSync( const uint64_t table_id, const std::vector& var_names) {} -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ (defined PADDLE_WITH_PSLIB) void FleetWrapper::PushDenseVarsAsync( const Scope& scope, @@ -816,6 +817,9 @@ void FleetWrapper::PushDenseVarsAsync( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); hipEventSynchronize(event); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, stream)); + musaEventSynchronize(event); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index fb5cf91729256..7fa90285e4fb3 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -175,7 +175,8 @@ class FleetWrapper { // Push dense variables to server in async mode // Param: scope, table_id, var_names, scale_datanorm, batch_size // Param: push_sparse_status -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 2cae0721aefa9..b00b25b4eab8d 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -121,7 +121,8 @@ void HeterWrapper::SerializeToReq(const std::string& varname, tensor->numel() * SizeOfType(framework::TransToProtoVarType(tensor->dtype()))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), @@ -141,7 +142,8 @@ void HeterWrapper::SerializeToReq(const std::string& varname, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + 
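Where the guarded code calls a runtime API directly, the patch instead adds an `#elif defined(PADDLE_WITH_MUSA)` branch with the MUSA equivalent, as in the box_wrapper `musaMemcpy` and fleet_wrapper `musaEventRecord`/`musaEventSynchronize` hunks above. A condensed sketch of that three-way dispatch; the header names are assumptions, since the patch shows only the call sites:

```cpp
// Host-to-device copy, dispatched at compile time to the active toolkit.
#include <cstddef>
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_MUSA)
#include <musa_runtime.h>   // assumed MUSA runtime header name
#else
#include <cuda_runtime.h>
#endif

inline void CopyHostToDevice(void* dst, const void* src, std::size_t bytes) {
#if defined(PADDLE_WITH_HIP)
  hipMemcpy(dst, src, bytes, hipMemcpyHostToDevice);
#elif defined(PADDLE_WITH_MUSA)
  musaMemcpy(dst, src, bytes, musaMemcpyHostToDevice);
#else
  cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
#endif
}
```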
defined(PADDLE_WITH_MUSA) void HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, @@ -169,7 +171,8 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, void* tensor_data = tensor->mutable_data( place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) memory::Copy(place, tensor_data, platform::CPUPlace(), diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 77838fbec6d00..ec4bc3a984c2c 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -92,7 +92,8 @@ class HeterWrapper { framework::proto::VarType::Type ToVarType(VariableMessage::Type type); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 3296679e1eeeb..f49ca915fb674 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -13,7 +13,8 @@ // limitations under the License. #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "gflags/gflags.h" @@ -64,7 +65,8 @@ void IPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) {} @@ -93,6 +95,8 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, platform::CUDADeviceGuard guard(place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_.reset( diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index f3d9ec54e6968..5150c91d0af0c 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -85,7 +85,8 @@ class IPUGarbageCollector : public GarbageCollector { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 1c186373cdbb5..e80a292d17f92 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -3,7 +3,9 @@ add_subdirectory(memory_optimize_pass) add_subdirectory(multi_devices_graph_pass) if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM)) + AND (WITH_GPU + OR WITH_ROCM + OR WITH_MUSA)) 
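garbage_collector.cc follows the same ladder for stream creation (`musaStreamCreate`), wrapped in `PADDLE_ENFORCE_GPU_SUCCESS`; the matching `musaStreamDestroy` appears further down in resource_manager.cc. A self-contained sketch, where `CheckOk()` is a local stand-in for the Paddle macro and `musaStream_t` is assumed by analogy with `cudaStream_t`:

```cpp
// RAII stream mirroring the StreamGarbageCollector change.
#include <cstdlib>

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using StreamT = hipStream_t;
inline void CheckOk(hipError_t s) { if (s != hipSuccess) std::abort(); }
#elif defined(PADDLE_WITH_MUSA)
#include <musa_runtime.h>  // assumed header name
using StreamT = musaStream_t;  // assumed type name
inline void CheckOk(musaError_t s) { if (s != musaSuccess) std::abort(); }
#else
#include <cuda_runtime.h>
using StreamT = cudaStream_t;
inline void CheckOk(cudaError_t s) { if (s != cudaSuccess) std::abort(); }
#endif

struct ScopedStream {
  StreamT stream{};
  ScopedStream() {
#if defined(PADDLE_WITH_HIP)
    CheckOk(hipStreamCreate(&stream));
#elif defined(PADDLE_WITH_MUSA)
    CheckOk(musaStreamCreate(&stream));
#else
    CheckOk(cudaStreamCreate(&stream));
#endif
  }
  ~ScopedStream() {
#if defined(PADDLE_WITH_HIP)
    hipStreamDestroy(stream);
#elif defined(PADDLE_WITH_MUSA)
    musaStreamDestroy(stream);
#else
    cudaStreamDestroy(stream);
#endif
  }
};
```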
add_subdirectory(fusion_group) endif() @@ -159,7 +161,9 @@ if(WITH_TENSORRT) pass_library(split_layernorm_to_math_ops_pass inference) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) pass_library(cudnn_placement_pass base DEPS placement_pass_base) pass_library(embedding_eltwise_layernorm_fuse_pass inference) endif() diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index 9ca3190fd092f..49b96836cfbbf 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -128,7 +128,8 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[op_push_index].CpuElapsedMs( main_thread_events[op_pop_index]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpu_time_ms = main_thread_events[op_push_index].CudaElapsedMs( main_thread_events[op_pop_index]); #endif @@ -152,7 +153,8 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[start_profiler_idx].CpuElapsedMs( main_thread_events[stop_profiler_idx]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpu_time_ms = main_thread_events[start_profiler_idx].CudaElapsedMs( main_thread_events[stop_profiler_idx]); #endif diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index 299e700edb95d..951c861bb7a4b 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -34,8 +34,10 @@ namespace framework { namespace ir { void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ + CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; graph = FuseBatchNormAct(graph, act_types); diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index 506e8721298b6..df5cbfa9e7e0b 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -25,7 +25,8 @@ namespace framework { namespace ir { void FuseBatchNormAddActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 06593733e6a27..9cffdaed6a59e 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -27,7 +27,8 @@ namespace phi { class DenseTensor; } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) namespace paddle { namespace framework { diff --git 
a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index a0f1d9eed0038..67b154989d346 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -203,7 +203,8 @@ TEST(test_reference_count_pass, test_no_need_buffer_var_shrink) { {}); std::vector use_cuda_list{false}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) use_cuda_list.push_back(true); #endif for (auto use_cuda : use_cuda_list) { diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index 1e6a6f02e2230..dcd12bed40ad4 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -53,7 +53,8 @@ inline std::tuple GetThreadPoolConfig(const phi::Place& place, processor_count = std::thread::hardware_concurrency(); if (processor_count) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) device_count = phi::backends::gpu::GetGPUDeviceCount(); #endif } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 13896b66f3c55..16398806597e3 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -641,7 +641,8 @@ void BuildOpFuncList(const platform::Place& place, *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context); auto expected_kernel_key = framework::TransPhiKernelKeyToOpKernelType( op_with_kernel->GetExpectedKernelType(exec_ctx)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (op_with_kernel->CanCUDNNBeUsed(exec_ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 1ae7e5e59ce1f..6680af7eb3206 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -48,7 +48,8 @@ DECLARE_bool(benchmark); DECLARE_uint64(executor_log_deps_every_microseconds); PHI_DECLARE_bool(new_executor_use_cuda_graph); PHI_DECLARE_bool(enable_new_ir_in_executor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_bool(sync_nccl_allreduce); #endif diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 3b40a3b0727f1..9382d7a4bd090 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -892,7 +892,8 @@ void NewIRInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if 
(FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; @@ -1245,7 +1246,8 @@ void NewIRInterpreter::RecordStreamForGC(const Instruction& instr) { void NewIRInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) RecordStreamForGC(instr); #endif auto& var_scope = var_scope_; diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 95eee77d36288..d72f201a9e02b 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -42,7 +42,8 @@ class ProfilerGuard { private: void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto cuda_place = place; cost_info_->device_memory_bytes = platform::RecordedGpuMallocSize(cuda_place.device); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index b6c54192a6970..d14bc40d32217 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -880,7 +880,8 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; @@ -1232,7 +1233,8 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { void ProgramInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) RecordStreamForGC(instr); #endif auto& var_scope = var_scope_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 8cb29a0d5df4c..8bc37165e67b8 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -358,7 +358,8 @@ struct OpKernelRegistrarFunctorExCanCUDNNBeUsed(exe_ctx, kernel_type.data_type_)) { auto tmp_kernel_type = kernel_type; tmp_kernel_type.library_type_ = framework::LibraryType::kCUDNN; @@ -1544,7 +1546,8 @@ bool OperatorWithKernel::CanCUDNNBeUsed(const framework::ExecutionContext& ctx, bool use_cudnn = ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn") && paddle::platform::is_gpu_place(ctx.GetPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (use_cudnn) { auto& dev_ctx = ctx.device_context(); use_cudnn &= 
(dev_ctx.cudnn_handle() != nullptr); @@ -1783,7 +1786,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (this->CanCUDNNBeUsed(exe_ctx, kernel_type_->data_type_)) { kernel_type_->library_type_ = framework::LibraryType::kCUDNN; } @@ -2109,7 +2113,8 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (this->CanCUDNNBeUsed(ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; } @@ -2132,7 +2137,8 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( // CPUKernel will be executed and a warning will be given at the same // time. expected_kernel_key.place_ = platform::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (SupportGPU()) { auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e6a2058107b1d..50802e83fd7fa 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -575,7 +575,8 @@ class ExecutionContext : public phi::KernelContext { return device_context_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) const inline phi::GPUContext& cuda_device_context() const { PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 806b8570108b9..b85a7bb0fa381 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -41,14 +41,16 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" PHI_DECLARE_double(eager_delete_tensor_gb); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -69,7 +71,8 @@ static std::once_flag gProfileOnce; static bool gProfileStarted = false; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::once_flag p2p_init_flag; #endif @@ -512,7 +515,8 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size)); } else { @@ -621,7 +625,8 @@ bool ParallelExecutor::NeedCreateLocalExeScope() { } void InitP2P(const std::vector &places) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::call_once(p2p_init_flag, [&]() { int count = places.size(); if (count <= 1) return; @@ -642,6 +647,10 @@ void InitP2P(const std::vector &places) { hipError_t ret = hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); if (ret != hipSuccess || can_acess != 1) { +#elif defined(PADDLE_WITH_MUSA) + musaError_t ret = + musaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); + if (ret != musaSuccess || can_acess != 1) { #else cudaError_t ret = cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); @@ -653,6 +662,8 @@ void InitP2P(const std::vector &places) { platform::CUDADeviceGuard guard(devices[i]); #ifdef PADDLE_WITH_HIP hipDeviceEnablePeerAccess(devices[j], 0); +#elif defined(PADDLE_WITH_MUSA) + musaDeviceEnablePeerAccess(devices[j], 0); #else cudaDeviceEnablePeerAccess(devices[j], 0); #endif @@ -1299,7 +1310,9 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( BuildStrategy::ReduceStrategy::kAllReduce; member_->use_all_reduce_ = true; } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + defined(_WIN32) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( device_count, @@ -1308,7 +1321,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( } #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( @@ -1674,7 +1688,8 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( final_graphs = *async_graphs; } else if (member_->build_strategy_.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + 
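The InitP2P hunk in parallel_executor.cc adds the MUSA variants of the peer-access calls (`musaDeviceCanAccessPeer`, `musaDeviceEnablePeerAccess`, `musaError_t`, `musaSuccess`). A reduced sketch of the per-device-pair logic, assuming the toolkit includes from the earlier sketches; device enumeration and the CUDADeviceGuard are omitted:

```cpp
// Returns true and enables access when device dev_i can reach device dev_j's memory.
// Assumes the current device has already been set to dev_i by the caller.
inline bool EnablePeerAccessIfPossible(int dev_i, int dev_j) {
  int can_access = 0;
#if defined(PADDLE_WITH_HIP)
  hipError_t ret = hipDeviceCanAccessPeer(&can_access, dev_i, dev_j);
  if (ret != hipSuccess || can_access != 1) return false;
  hipDeviceEnablePeerAccess(dev_j, 0);
#elif defined(PADDLE_WITH_MUSA)
  musaError_t ret = musaDeviceCanAccessPeer(&can_access, dev_i, dev_j);
  if (ret != musaSuccess || can_access != 1) return false;
  musaDeviceEnablePeerAccess(dev_j, 0);
#else
  cudaError_t ret = cudaDeviceCanAccessPeer(&can_access, dev_i, dev_j);
  if (ret != cudaSuccess || can_access != 1) return false;
  cudaDeviceEnablePeerAccess(dev_j, 0);
#endif
  return true;
}
```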
defined(PADDLE_WITH_MUSA) // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. bool is_inference = details::IsDataParallelInferenceGraph(*graph); diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 9881d479a75a2..070c85d425ee0 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -134,7 +134,8 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key, phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == phi::Backend::GPU || kernel_key.backend() == phi::Backend::GPUDNN) { PADDLE_THROW( diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index f8589e95ff8e9..33493669755e9 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -72,7 +72,8 @@ struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template <> struct ConvertToPhiContext { using TYPE = phi::GPUContext; diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 7b61052a20151..40296242b1927 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -69,11 +69,12 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { fleet_ptr_ = FleetWrapper::GetInstance(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) copy_streams_.clear(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) places_.clear(); thread_scopes_.clear(); #endif @@ -81,7 +82,7 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { void PullDenseWorker::CreatePinVar() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); @@ -96,7 +97,8 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); phi::DenseTensor* pin_tensor = ptr->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); #endif @@ -126,7 +128,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { } status_vec->resize(0); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) for (size_t i = 0; i < places_.size(); ++i) { // for (auto& v : dense_value_names_) { @@ -144,7 +146,8 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); phi::DenseTensor* tensor = var->GetMutable(); float* w = tensor->data(); -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), @@ -180,7 +183,7 @@ void PullDenseWorker::PullDense(bool force_update) { dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 58e879a5011c2..d2f8a9f955608 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -228,7 +228,8 @@ void SectionWorker::TrainFiles() { int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size)); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 5ef6f53d38d50..37378d4d3a161 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -114,7 +114,8 @@ TEST(DenseTensor, MutableData) { EXPECT_EQ(static_cast(p2[0]), 1); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; float* p1 = nullptr; @@ -168,7 +169,8 @@ TEST(DenseTensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; phi::DenseTensor dst_tensor; @@ -206,7 +208,8 @@ TEST(DenseTensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 9}), @@ -295,7 +298,8 @@ TEST(DenseTensor, Split) { EXPECT_EQ(src_data_address + 2 * 2 * i * sizeof(int), split_data_address); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 4}), @@ -357,7 +361,8 @@ TEST(DenseTensor, Chunk) { EXPECT_EQ(src_data_address + 2 * 2 * i * sizeof(int), split_data_address); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 4}), diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index d8224cb0dd72b..df8bfcbb5d473 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -124,7 +124,8 @@ void TensorCopyImpl(const TENSOR& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -377,7 +378,8 @@ void TensorCopySync(const phi::DenseTensor& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -479,7 +481,8 @@ void TensorToStream(std::ostream& os, platform::errors::ResourceExhausted( "tensor size %d overflow when writing tensor", size)); if (platform::is_gpu_place(tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); auto& gpu_dev_ctx = static_cast(dev_ctx); @@ -614,7 +617,8 @@ void TensorFromStream(std::istream& is, platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(shape)); framework::VisitDataType( @@ -687,7 +691,8 @@ void TensorFromStream(std::istream& is, platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(dims)); framework::VisitDataType( @@ -809,7 +814,8 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) { if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(dl_tensor.device.device_id); @@ -849,7 +855,8 @@ void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) { void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (src->dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(src->dl_tensor.device.device_id); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 36a3e968251c9..c9ec8f0c34d79 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -129,7 +129,8 @@ void TensorFromArray(const T* src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, 
src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -175,7 +176,8 @@ void TensorFromVector(const std::vector& src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -304,7 +306,8 @@ void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -346,7 +349,8 @@ inline void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index bda2681f57f31..638114df3d2da 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -58,7 +58,8 @@ TEST(TensorCopy, Tensor) { } EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor src_tensor; phi::DenseTensor gpu_tensor; @@ -153,7 +154,8 @@ TEST(TensorFromVector, Tensor) { delete cpu_place; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor cpu_tensor; @@ -232,7 +234,8 @@ TEST(TensorToVector, Tensor) { EXPECT_EQ(src_ptr[i], dst[i]); } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor gpu_tensor; @@ -323,7 +326,8 @@ TEST(TensorFromDLPack, Tensor) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor cpu_tensor; @@ -489,7 +493,8 @@ TEST(Tensor, FromAndToStream) { EXPECT_EQ(dst_tensor.dims(), src_tensor.dims()); delete place; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { phi::DenseTensor gpu_tensor; gpu_tensor.Resize({2, 3}); diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index bf69bed9d4851..4d9b39a77ec04 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -172,7 +172,8 @@ class 
HeterServiceContext { int place_num_; Scope* scope_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuEvent_t event_; #endif std::vector ops_; @@ -204,7 +205,8 @@ class HeterXpuTrainer : public TrainerBase { virtual std::string GetDumpPath(int tid) { return ""; } virtual void InitDumpEnv() {} template -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void HeterMemCpy(phi::DenseTensor* tensor, phi::DenseTensor* root_tensor, const paddle::platform::Place& thread_place, @@ -242,7 +244,8 @@ class HeterXpuTrainer : public TrainerBase { std::vector place_scopes_; BtObjectPool object_pool_; std::vector places_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::vector copy_streams_; std::vector events_; #endif diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index d73c9b7d95957..0b289b8a6ddff 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -37,6 +37,13 @@ #include "paddle/fluid/operators/miopen_rnn_cache.h" #endif +#ifdef PADDLE_WITH_MUSA +#if defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT +#endif +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 2e188e6caa076..b62347b1561bf 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -33,6 +33,12 @@ #include #endif #endif +#ifdef PADDLE_WITH_MUSA +#include +#if defined(PADDLE_WITH_MCCL) +#include +#endif +#endif #ifdef PADDLE_WITH_HIP #include #ifdef PADDLE_WITH_RCCL @@ -59,7 +65,8 @@ class SparseCsrTensor; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class Communicator; class NCCLCommunicator; @@ -189,14 +196,18 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_MCCL) ncclUniqueId, platform::Communicator, platform::NCCLCommunicator, #endif +#ifndef PADDLE_WITH_MUSA operators::CudnnRNNCache, #endif +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index be715a2a451ad..da93b60b4a280 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -138,7 +138,8 @@ AmpOperators::AmpOperators() block_ops_(new std::unordered_set()), unsupported_fp16_ops_(new std::unordered_set()), unsupported_bf16_ops_(new std::unordered_set()) { -#if defined(PADDLE_WITH_CUDA) || 
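The var_type_traits change is slightly different: the type registry keeps the NCCL-family communicator entries whenever NCCL, RCCL, or MCCL is enabled, but compiles out `operators::CudnnRNNCache` on MUSA builds. A loose illustration using `std::variant` as a stand-in for `VarTypeRegistryImpl` (the stub types are placeholders, not Paddle classes):

```cpp
// Conditional type-list membership, mirroring the var_type_traits.h hunk.
#include <string>
#include <variant>

struct CommunicatorStub {};   // placeholder for platform::NCCLCommunicator
struct CudnnRNNCacheStub {};  // placeholder for operators::CudnnRNNCache

using VarRegistrySketch = std::variant<
    int, float, std::string
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
    defined(PADDLE_WITH_MCCL)
    , CommunicatorStub   // communicator types kept for any collective backend
#endif
#ifndef PADDLE_WITH_MUSA
    , CudnnRNNCacheStub  // dropped on MUSA builds
#endif
    >;
```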
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto unsupported_ops_gpu_fp16 = std::get<2>( OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16)); unsupported_fp16_ops_->insert(unsupported_ops_gpu_fp16.begin(), diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 14b9bc5aae0bc..7199762e0c5ac 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -204,7 +204,8 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_TENSOR_ADD(float, phi::GPUContext); PADDLE_TENSOR_ADD(double, phi::GPUContext); PADDLE_TENSOR_ADD(phi::dtype::float16, phi::GPUContext); @@ -313,7 +314,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); @@ -321,7 +323,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif @@ -364,7 +367,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); @@ -372,7 +376,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif @@ -425,7 +430,8 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, return dst_var; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); @@ -441,7 +447,8 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_XPU) } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif @@ -712,7 +719,8 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { @@ -778,7 +786,8 @@ void 
SortedGradientAccumulator::SumGrad(std::shared_ptr var, // Increase count IncreaseCurCnt(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif tmp_grad_vars_.clear(); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index cda2fad5d7436..206c3e562e70a 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -205,7 +205,8 @@ PreparedOp PrepareImpl( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (op.CanCUDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.dtype())) { expected_kernel_key.set_backend(phi::Backend::GPUDNN); } @@ -555,7 +556,8 @@ static void PreparedOpRunImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif @@ -645,7 +647,8 @@ static void PreparedOpRunPtImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index ccb58d320221c..2c0669aa12883 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -106,7 +106,8 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if (gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gc.reset(new framework::DefaultStreamGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; @@ -116,7 +117,8 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gc.reset(new framework::CUDAPinnedGarbageCollector(place, 0)); VLOG(10) << "Created GarbageCollector at " << place; @@ -274,7 +276,8 @@ void Tracer::TraceOpImpl(const std::string& type, try { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::SetDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 65e149925e742..4777082196771 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -36,7 +36,8 @@ namespace paddle { namespace 
inference { namespace analysis { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. if (!argument->use_gpu()) return; @@ -209,7 +210,8 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { argument->scope_valid(), true, platform::errors::PreconditionNotMet("The scope field should be valid")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (argument->use_gpu_valid()) { CopyParamsToGpu(argument); } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index ee29af1c13308..86f8a12539809 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -32,7 +32,8 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { std::string repr() const override; private: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void CopyParamsToGpu(Argument *argument); #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 25c7e7e2a03d4..902034a9bd899 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -32,7 +32,8 @@ #include "paddle/fluid/inference/tensorrt/helper.h" #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_uint64(initial_gpu_memory_in_mb); #endif @@ -100,7 +101,8 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id, Precision precision_mode) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_; @@ -630,7 +632,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { } void AnalysisConfig::EnableCUDNN() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) use_cudnn_ = use_gpu_; #else LOG(ERROR) << "Please compile with CUDA first to use cuDNN"; @@ -928,7 +931,8 @@ void AnalysisConfig::Update() { } if (use_gpu() && use_cudnn_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (!enable_ir_optim_) { LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled."; } else { @@ -1145,7 +1149,8 @@ void AnalysisConfig::SetCpuMathLibraryNumThreads( } float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Get the GPU memory details and calculate the fraction of memory for the // GPU 
memory pool. size_t gpu_total, gpu_available; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 56652c2f42cb7..d6c535c591cda 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -99,7 +99,8 @@ namespace paddle { namespace { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, GPUContextResource *gpu_resource, Place place_) { @@ -131,7 +132,9 @@ void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, gpu_context->SetBlasTF32Handle( gpu_resource->GetBlasTF32TensorCoreHandleCreator()); gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator()); +#ifndef PADDLE_WITH_MUSA gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator()); +#endif gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator()); gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice()); @@ -270,7 +273,8 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place)); auto dst_gpu_place = place; @@ -370,7 +374,8 @@ bool AnalysisPredictor::Init( return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // TODO(inference): Now only gpu with external stream support private // device_context. if (config_.use_gpu_ && config_.use_external_stream_) { @@ -418,7 +423,8 @@ void AnalysisPredictor::InitPlace() { platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (config_.thread_local_stream_enabled()) { LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. " "Please use config.SetExecStream instead."; @@ -489,14 +495,16 @@ void AnalysisPredictor::InitPlace() { } void AnalysisPredictor::InitResourceManager(void *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) predictor_stream_ = ResourceManager::Instance().InitGPUResource(place_, stream); #endif } void AnalysisPredictor::InitDeviceContexts() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Init GPUContext. 
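In UpdatePrivateDeviceContext the solver handle setter is excluded with `#ifndef PADDLE_WITH_MUSA`, and the same exclusion reappears below in infer_context.h and resource_manager.cc, presumably because no cuSOLVER counterpart is wired up for MUSA yet (an inference on my part, not stated in the patch). Schematically:

```cpp
// Illustrative only: a context that exposes a solver handle everywhere
// except on MUSA builds. Names are placeholders, not the real Paddle interfaces.
struct InferGpuContextSketch {
  void SetDnnHandle(void* h) { dnn_ = h; }
#ifndef PADDLE_WITH_MUSA
  void SetSolverHandle(void* h) { solver_ = h; }  // absent on MUSA builds
#endif

 private:
  void* dnn_{nullptr};
#ifndef PADDLE_WITH_MUSA
  void* solver_{nullptr};
#endif
};
```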
if (place_.GetType() == phi::AllocationType::GPU) { device_contexts_.emplace( @@ -534,7 +542,8 @@ void AnalysisPredictor::InitDeviceContexts() { } void *AnalysisPredictor::GetExecStream() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (place_.GetType() == phi::AllocationType::GPU) { if (private_context_) { return predictor_stream_; @@ -2151,7 +2160,8 @@ bool AnalysisPredictor::ZeroCopyRun() { return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (!private_context_) { PADDLE_THROW(platform::errors::Fatal( @@ -2162,6 +2172,8 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (stream != predictor_stream_) { #ifdef PADDLE_WITH_HIP hipStreamSynchronize(static_cast(predictor_stream_)); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(static_cast(predictor_stream_)); #else cudaStreamSynchronize(static_cast(predictor_stream_)); #endif @@ -2199,11 +2211,14 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); if (config_.use_gpu()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto *dev_ctx = pool.Get(place_); auto stream = static_cast(dev_ctx)->stream(); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(stream); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(stream); #else cudaStreamSynchronize(stream); #endif @@ -2595,7 +2610,8 @@ AnalysisPredictor::~AnalysisPredictor() { if (config_.shape_range_info_collected()) { StatisticShapeRangeInfo(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (predictor_stream_ != nullptr) { ResourceManager::Instance().DestroyGPUResource(predictor_stream_); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index bde6ca48741ad..547cf0d2284be 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -220,7 +220,8 @@ class AnalysisPredictor : public PaddlePredictor { /// bool ZeroCopyRun() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Note: Can only be used under thread_local semantics. 
bool ExpRunWithExternalStream(const gpuStream_t stream); #endif diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 28353150c265c..faf3cedce947d 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -250,7 +250,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 37ee2b4df643d..dd219f2c59fd5 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -108,7 +108,8 @@ T *Tensor::mutable_data(PlaceType place) { return tensor->mutable_data(paddle::platform::CPUPlace()); } case static_cast(PlaceType::kGPU): { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_castmutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_caststream()); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(dev_ctx->stream()); +#elif defined(PADDLE_WITH_MUSA) + musaStreamSynchronize(dev_ctx->stream()); #else // async, return stream if (nullptr != exec_stream) { @@ -821,7 +826,8 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, auto *t_data = tensor->mutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) paddle::platform::CUDAPlace gpu_place(t->device_); auto *t_data = tensor->mutable_data(gpu_place); paddle::memory::Copy(gpu_place, @@ -891,7 +897,8 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); #endif } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), t_place, diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc index 533363f1b25da..b2abb21602dc9 100644 --- a/paddle/fluid/inference/api/infer_context.cc +++ b/paddle/fluid/inference/api/infer_context.cc @@ -21,7 +21,8 @@ namespace paddle { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) InferGPUContext::InferGPUContext(const phi::Place& place) : phi::GPUContext(place, false) {} #endif diff --git a/paddle/fluid/inference/api/infer_context.h 
b/paddle/fluid/inference/api/infer_context.h index 2b5c4e974eb08..eef3d31a5c493 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -26,7 +26,8 @@ class InferCPUContext : public phi::CPUContext { using phi::CPUContext::SetEigenDevice; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class InferGPUContext : public phi::GPUContext { public: explicit InferGPUContext(const phi::Place& place); @@ -35,7 +36,9 @@ class InferGPUContext : public phi::GPUContext { using phi::GPUContext::SetBlasTF32Handle; using phi::GPUContext::SetDnnHandle; using phi::GPUContext::SetEigenDevice; +#ifndef PADDLE_WITH_MUSA using phi::GPUContext::SetSolverHandle; +#endif using phi::GPUContext::SetSparseHandle; using phi::GPUContext::SetStream; // using phi::GPUContext::SetDnnWorkspaceHandle; diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index 3f06ee5722af9..a13fa97b69185 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -44,7 +44,8 @@ namespace paddle { namespace internal { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class EigenGpuStreamDevice : public Eigen::StreamInterface { public: EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { @@ -102,6 +103,9 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); @@ -132,7 +136,8 @@ void CPUContextResource::InitCPUResource() { CPUContextResource::CPUContextResource() { InitCPUResource(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) GPUContextResource::GPUContextResource(const phi::Place& place, void* stream) : place_(place) { InitGPUResource(stream); @@ -158,6 +163,8 @@ void GPUContextResource::DestroyGPUResource() { if (owned_stream_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); #endif @@ -166,8 +173,10 @@ void GPUContextResource::DestroyGPUResource() { DestroyDnnHandle(); DestroyBlasHandle(); +#ifndef PADDLE_WITH_MUSA DestroyBlasLtHandle(); DestroySolverHandle(); +#endif DestroySparseHandle(); } @@ -204,7 +213,7 @@ void GPUContextResource::DestroyBlasHandle() { phi::DestroyBlasHandle(blas_tensor_core_handle_); phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); } - +#ifndef PADDLE_WITH_MUSA void GPUContextResource::InitBlasLtHandle() { phi::InitBlasLtHandle(&blaslt_handle_); } @@ -220,6 +229,7 @@ void GPUContextResource::InitSolverHandle() { void GPUContextResource::DestroySolverHandle() { phi::DestroySolverHandle(solver_handle_); } +#endif void GPUContextResource::InitSparseHandle() { phi::InitSparseHandle(&sparse_handle_, stream_); @@ -287,6 +297,7 @@ GPUContextResource::GetBlasTF32TensorCoreHandleCreator() { }; } +#ifndef 
PADDLE_WITH_MUSA blasLtHandle_t GPUContextResource::GetBlasLtHandle() const { return blaslt_handle_; } @@ -310,6 +321,7 @@ GPUContextResource::GetSolverDnHandleCreator() { return solver_handle_; }; } +#endif phi::sparseHandle_t GPUContextResource::GetSparseHandle() const { return sparse_handle_; @@ -375,7 +387,8 @@ CPUContextResource* ResourceManager::GetCPUResource() const { return cpu_resource_.get(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void* ResourceManager::InitGPUResource(const phi::Place& place, void* stream) { std::lock_guard lock_gurad(gpu_mutex_); if (gpu_resources_.count(stream)) { diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index e14de1c2ffc86..36841a46c4878 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -25,7 +25,8 @@ #include "paddle/phi/common/place.h" #include "unsupported/Eigen/CXX11/Tensor" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -49,7 +50,8 @@ class CPUContextResource { std::unique_ptr cpu_eigen_device_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class GPUContextResource { public: explicit GPUContextResource(const phi::Place& place, void* stream); @@ -60,8 +62,10 @@ class GPUContextResource { std::function GetBlasHandleCreator(); std::function GetBlasTensorCoreHandleCreator(); std::function GetBlasTF32TensorCoreHandleCreator(); +#ifndef PADDLE_WITH_MUSA std::function GetBlasLtHandleCreator(); std::function GetSolverDnHandleCreator(); +#endif std::function GetSparseHandleCreator(); std::function GetGpuEigenDeviceCreator(); @@ -70,8 +74,10 @@ class GPUContextResource { blasHandle_t GetBlasHandle() const; blasHandle_t GetBlasTensorCoreHandle() const; blasHandle_t GetBlasTF32Handle() const; +#ifndef PADDLE_WITH_MUSA blasLtHandle_t GetBlasLtHandle() const; phi::solverHandle_t GetSolverDnHandle() const; +#endif phi::sparseHandle_t GetSparseHandle() const; Eigen::GpuDevice* GetGpuEigenDevice() const; int GetGpuComputeCapability() const; @@ -90,10 +96,12 @@ class GPUContextResource { void InitDnnHanlde(); void DestroyDnnHandle(); void DestroyBlasHandle(); +#ifndef PADDLE_WITH_MUSA void InitBlasLtHandle(); void DestroyBlasLtHandle(); void InitSolverHandle(); void DestroySolverHandle(); +#endif void InitSparseHandle(); void DestroySparseHandle(); @@ -116,9 +124,11 @@ class GPUContextResource { blasHandle_t blas_handle_{nullptr}; blasHandle_t blas_tensor_core_handle_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; +#ifndef PADDLE_WITH_MUSA blasLtHandle_t blaslt_handle_{nullptr}; - dnnHandle_t dnn_handle_{nullptr}; phi::solverHandle_t solver_handle_{nullptr}; +#endif + dnnHandle_t dnn_handle_{nullptr}; phi::sparseHandle_t sparse_handle_{nullptr}; // DnnWorkspaceHandle }; @@ -141,7 +151,8 @@ class ResourceManager { std::mutex cpu_mutex_; std::unique_ptr cpu_resource_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // GPU Resource public: void* 
InitGPUResource(const phi::Place& place, void* stream); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 6de5f9cfa0ca1..edf34de39f4e6 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -127,7 +127,8 @@ void MemoryCopyAsync(const platform::Place& dst_place, if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) { memory::Copy(cpu_place, dst_data, cpu_place, src_data, size); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_cpu_place(dst_place) && platform::is_gpu_place(src_place)) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 2500f624967c6..3c8f0694ee774 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -218,6 +218,9 @@ void QkvToContextPluginDynamic::configurePlugin( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 1a39590398911..c044d25053ba3 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -19,7 +19,9 @@ set(ALLOCATOR_SRCS buddy_allocator.cc system_allocator.cc) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) list( APPEND ALLOCATOR_SRCS @@ -89,6 +91,10 @@ if(WITH_ROCM) SRCS thread_local_allocator_test.cc DEPS allocator) endif() +if(WITH_MUSA) + musa_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc + DEPS allocator) +endif() if(WITH_GPU) nv_test( @@ -100,6 +106,15 @@ elseif(WITH_ROCM) best_fit_allocator_test SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu DEPS allocator memcpy) +elseif(WITH_MUSA) + musa_test( + best_fit_allocator_test + SRCS + best_fit_allocator_test.cc + best_fit_allocator_test.cu + DEPS + allocator + memcpy) else() cc_test_old(best_fit_allocator_test SRCS best_fit_allocator_test.cc DEPS allocator) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 07e55115ba130..6ba81821871f8 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -27,7 +27,8 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/macros.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include #include "paddle/fluid/memory/allocation/cuda_allocator.h" @@ -164,7 +165,8 @@ class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) using CUDAAllocatorMap = std::map>>; @@ -187,7 +189,8 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } 
#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -214,7 +217,8 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) allow_free_idle_chunk_ = allow_free_idle_chunk; for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -286,7 +290,8 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -345,7 +350,8 @@ class AllocatorFacadePrivate { LIKELY(FLAGS_use_system_allocator == false); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) { auto it = cuda_allocators_.find(place); if (it == cuda_allocators_.end()) { @@ -594,7 +600,8 @@ class AllocatorFacadePrivate { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void InitNaiveBestFitCUDAPinnedAllocator() { allocators_[platform::CUDAPinnedPlace()] = std::make_shared(platform::CUDAPinnedPlace()); @@ -655,7 +662,7 @@ class AllocatorFacadePrivate { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " << FLAGS_auto_growth_chunk_size_in_mb; -#if defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto cuda_allocator = CreateCUDAAllocator(p); cuda_allocators_[p][stream] = std::make_shared( cuda_allocator, @@ -741,7 +748,7 @@ class AllocatorFacadePrivate { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " << FLAGS_auto_growth_chunk_size_in_mb; -#if defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) auto cuda_allocator = CreateCUDAAllocator(p); allocators_[p] = std::make_shared( cuda_allocator, @@ -1038,7 +1045,8 @@ class AllocatorFacadePrivate { system_allocators_[p] = std::make_shared(p); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); int device_count = platform::GetGPUDeviceCount(); @@ -1064,7 +1072,8 @@ class AllocatorFacadePrivate { if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); @@ -1112,7 +1121,8 @@ 
class AllocatorFacadePrivate { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (is_stream_safe_cuda_allocator_used_) { CheckCUDAAllocThreadSafe(cuda_allocators_); } @@ -1145,7 +1155,8 @@ class AllocatorFacadePrivate { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // a standalone CUDA allocator to support multi-stream GC in new executor std::map> default_stream_safe_cuda_allocators_; @@ -1252,7 +1263,8 @@ std::shared_ptr AllocatorFacade::AllocShared( AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) AllocatorFacadePrivate* m = GetPrivate(); if (!m->IsStreamSafeCUDAAllocatorUsed()) { VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; @@ -1278,7 +1290,8 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, bool AllocatorFacade::InSameStream( const std::shared_ptr& allocation, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuStream_t s = reinterpret_cast(stream.id()); return s == GetStream(allocation); #else @@ -1290,7 +1303,8 @@ bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() { return GetPrivate()->IsStreamSafeCUDAAllocatorUsed(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, gpuStream_t stream) { AllocatorFacadePrivate* m = GetPrivate(); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index a1f21a5e69359..92bbc03378be2 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -76,7 +76,8 @@ class AllocatorFacade { bool IsStreamSafeCUDAAllocatorUsed(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. 
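
The three-way guard defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) recurs at every allocator-facade site touched above. A hedged sketch of how such a guard could be centralized; the PADDLE_WITH_DEVICE_GPU name is hypothetical, and the patch itself spells the condition out at each use:

// Hypothetical convenience macro, shown only to make the repeated guard explicit.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_MUSA)
#define PADDLE_WITH_DEVICE_GPU 1
#endif

#ifdef PADDLE_WITH_DEVICE_GPU
// Stream-aware, GPU-only declarations (e.g. the Release/RecordStream
// overloads guarded above) would sit behind this single macro.
#endif
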
uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc index 1e09c43c4f12f..fe905932c626b 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -17,7 +17,8 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); @@ -46,7 +47,8 @@ void AllocateTestCases() { ASSERT_EQ(cpu_allocation->size(), size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { place = platform::CUDAPlace(0); size = 1024; @@ -82,7 +84,8 @@ void AllocateTestCases() { } TEST(Allocator, SpecifyGpuMemory) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and // FLAGS_reallocate_gpu_memory_in_mb FLAGS_fraction_of_gpu_memory_to_use = 0.0; diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc index 63e3eab3256c9..b88c952243a06 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc @@ -17,7 +17,8 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); @@ -46,7 +47,8 @@ void AllocateTestCases() { ASSERT_EQ(cpu_allocation->size(), size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { place = platform::CUDAPlace(0); size = 1024; @@ -82,7 +84,8 @@ void AllocateTestCases() { } TEST(Allocator, Allocator) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_gpu_allocator_retry_time = 500; FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index bfd05b6b323fe..47a4be778819b 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -23,7 +23,8 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); DECLARE_int64(gpu_allocator_retry_time); @@ -41,7 +42,8 @@ static inline size_t AlignTo(size_t size, size_t alignment) { } TEST(allocator, allocator) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_gpu_allocator_retry_time = 500; FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; @@ -102,7 +104,8 @@ TEST(allocator, allocator) { TEST(multithread_allocate, test_segfault) { FLAGS_allocator_strategy = "auto_growth"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::mutex mtx; std::condition_variable cv; bool flag = false; diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 8de464754cb35..8089f21a3619f 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -19,7 +19,8 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #define USE_DEVICE PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif @@ -53,7 +54,8 @@ BuddyAllocator::BuddyAllocator( platform::PlaceHelper::CreatePlace(dev_type)); }; } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) init_allocate_size_func_ = &platform::GpuInitAllocSize; re_allocate_size_func_ = &platform::GpuReallocSize; #endif @@ -249,7 +251,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( allocate_bytes = DeviceAllocateSize( init_allocate_size_func_, re_allocate_size_func_, request_bytes); #else -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) allocate_bytes = DeviceAllocateSize( &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); #endif diff --git a/paddle/fluid/memory/allocation/buddy_allocator_test.cc b/paddle/fluid/memory/allocation/buddy_allocator_test.cc index 1aeb1722d0ec8..e74544e292306 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator_test.cc @@ -26,7 +26,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -77,7 +78,8 @@ int* TestBuddyAllocator(BuddyAllocator* allocator, return nullptr; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(BuddyAllocator, GpuFraction) { // In a 16 GB machine, the pool size will be about 160 MB FLAGS_fraction_of_gpu_memory_to_use = 0.01; @@ -244,6 +246,9 @@ TEST(BuddyAllocator, AllocFromAvailable) { #ifdef PADDLE_WITH_HIP hipError_t result = hipMalloc(&p, available >> 1); EXPECT_TRUE(result == hipSuccess); +#elif defined(PADDLE_WITH_MUSA) + musaError_t result = musaMalloc(&p, available >> 1); + EXPECT_TRUE(result == musaSuccess); #else cudaError_t result = cudaMalloc(&p, available >> 1); EXPECT_TRUE(result == cudaSuccess); @@ -263,6 +268,8 @@ TEST(BuddyAllocator, AllocFromAvailable) { if (p) { #ifdef PADDLE_WITH_HIP EXPECT_TRUE(hipFree(p) == hipSuccess); +#elif defined(PADDLE_WITH_MUSA) + EXPECT_TRUE(musaFree(p) == musaSuccess); #else EXPECT_TRUE(cudaFree(p) == cudaSuccess); #endif @@ -278,6 +285,8 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) { #ifdef PADDLE_WITH_HIP EXPECT_TRUE(hipMalloc(&p, static_cast(1) << 30) == hipSuccess); +#elif defined(PADDLE_WITH_MUSA) + EXPECT_TRUE(musaMalloc(&p, static_cast(1) << 30) == musaSuccess); #else EXPECT_TRUE(cudaMalloc(&p, static_cast(1) << 30) == cudaSuccess); #endif @@ -294,6 +303,8 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) { if (p) { #ifdef PADDLE_WITH_HIP EXPECT_TRUE(hipFree(p) == hipSuccess); +#elif defined(PADDLE_WITH_MUSA) + EXPECT_TRUE(musaFree(p) == musaSuccess); #else EXPECT_TRUE(cudaFree(p) == cudaSuccess); #endif diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 781addd7dba60..51e6c88d55d50 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -19,6 +19,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 7286f84160c6a..42e6f7be8de31 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -82,6 +82,9 @@ class GPUContextAllocator : public Allocator { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreate(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreate(&event_, cudaEventDisableTiming)); @@ -92,8 +95,9 @@ class GPUContextAllocator : public Allocator { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_WARN_GPU_SUCCESS(hipEventDestroy(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_WARN_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_WARN_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -113,6 +117,9 @@ class GPUContextAllocator : public 
Allocator { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(default_stream_, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0)); diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc index 06e9fbe88827b..d1b68212736ee 100644 --- a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc @@ -19,6 +19,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.h b/paddle/fluid/memory/allocation/cuda_managed_allocator.h index a01e1c58d439b..3fdcfb8038086 100644 --- a/paddle/fluid/memory/allocation/cuda_managed_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index e436e6c439081..e4b0273a6efc3 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -26,7 +26,8 @@ #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/common/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" @@ -213,7 +214,8 @@ size_t Used(const platform::XPUPlace &place) { } // For CUDA -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class GPUBuddyAllocatorList { private: GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) { @@ -283,7 +285,8 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { template <> size_t Used(const platform::CUDAPlace &place) { -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \ + defined PADDLE_WITH_MUSA) return GetGPUBuddyAllocator(place.device)->Used(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -294,7 +297,8 @@ size_t Used(const platform::CUDAPlace &place) { template <> void *Alloc(const platform::CUDAPlace &place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { @@ -315,6 +319,8 @@ void *Alloc(const platform::CUDAPlace &place, if (FLAGS_init_allocated_mem) { #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0xEF, size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(ptr, 0xEF, size); #else cudaMemset(ptr, 0xEF, size); #endif @@ -331,7 +337,8 @@ template <> void Free(const 
platform::CUDAPlace &place, void *p, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) GetGPUBuddyAllocator(place.device)->Free(p); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -341,7 +348,8 @@ void Free(const platform::CUDAPlace &place, template <> uint64_t Release(const platform::CUDAPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return GetGPUBuddyAllocator(place.device)->Release(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -349,7 +357,8 @@ uint64_t Release(const platform::CUDAPlace &place) { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) BuddyAllocator *GetCUDAPinnedBuddyAllocator() { static std::once_flag init_flag; static BuddyAllocator *ba = nullptr; @@ -367,7 +376,8 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() { template <> size_t Used(const platform::CUDAPinnedPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return GetCUDAPinnedBuddyAllocator()->Used(); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -378,7 +388,8 @@ size_t Used(const platform::CUDAPinnedPlace &place) { template <> void *Alloc(const platform::CUDAPinnedPlace &place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); auto *buddy_allocator = GetCUDAPinnedBuddyAllocator(); void *ptr = buddy_allocator->Alloc(size); @@ -401,7 +412,8 @@ template <> void Free(const platform::CUDAPinnedPlace &place, void *p, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -413,7 +425,8 @@ void Free(const platform::CUDAPinnedPlace &place, template <> uint64_t Release( const platform::CUDAPinnedPlace &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) VLOG(10) << "Release on " << platform::Place(place); return GetCUDAPinnedBuddyAllocator()->Release(); #else @@ -602,7 +615,8 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const { } size_t Usage::operator()(const platform::CUDAPlace &gpu) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return Used(gpu); #else PADDLE_THROW(platform::errors::PermissionDenied( @@ -611,7 +625,8 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const { } size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return Used(cuda_pinned); #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc 
b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 37da748ee9c96..b6be358fde05c 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -33,7 +33,8 @@ TEST(NaiveBestFitAllocatorTest, CpuAlloc) { alloc.Release(platform::CPUPlace()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(NaiveBestFitAllocatorTest, GpuAlloc) { NaiveBestFitAllocator alloc{platform::CUDAPlace(0)}; { diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index f1c0178fafc02..567ec4e4c9461 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -23,6 +23,8 @@ bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaFreeHost(allocation->ptr())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif @@ -37,6 +39,8 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaHostAlloc(&ptr, size, musaHostAllocPortable)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index d1872ee00b7b7..115fa600ad972 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -19,7 +19,8 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/memory/allocation/cuda_allocator.h" #endif @@ -114,7 +115,8 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) { platform::CUDAPlace p(0); RetryAllocator allocator(std::make_shared(p), retry_ms); diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 9f513448eea26..ae9738ee2afd8 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -86,6 +86,16 @@ bool StreamSafeCUDAAllocation::CanBeFreed() { } PADDLE_ENFORCE_GPU_SUCCESS(err); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + gpuError_t err = musaEventQuery(event); + if (err == musaErrorNotReady) { + VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; + // Erase the completded event before "it" + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else gpuError_t err = 
hipEventQuery(event); if (err == hipErrorNotReady) { @@ -122,6 +132,9 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&new_event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); @@ -136,6 +149,8 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(record_event, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 08ecdd4969730..efa0e8393aa20 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/memory/allocation/allocator.h" @@ -24,6 +25,8 @@ #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index 210be01669775..0ef6b35f8cdac 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -33,7 +33,8 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -120,7 +121,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { bool CPUAllocator::UseGpu() const { return false; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void* GPUAllocator::Alloc(size_t* index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr @@ -216,6 +218,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { // PINNED memory is visible to all CUDA contexts. 
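
The pinned-host allocations in these hunks map one runtime call per backend, each with the same "portable" flag. A compact sketch of that mapping, illustrative only (error checking via PADDLE_ENFORCE_GPU_SUCCESS is omitted; the calls and flags are the ones the patch uses):

// Sketch: allocate page-locked host memory visible to all device contexts.
static void* AllocPortablePinned(size_t size) {
  void* p = nullptr;
#ifdef PADDLE_WITH_HIP
  hipHostMalloc(&p, size, hipHostMallocPortable);
#elif defined(PADDLE_WITH_MUSA)
  musaHostAlloc(&p, size, musaHostAllocPortable);
#else
  cudaHostAlloc(&p, size, cudaHostAllocPortable);
#endif
  return p;
}
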
#ifdef PADDLE_WITH_HIP hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); +#elif defined(PADDLE_WITH_MUSA) + musaError_t result = musaHostAlloc(&p, size, musaHostAllocPortable); #else cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); #endif @@ -259,6 +263,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } +#elif defined(PADDLE_WITH_MUSA) + err = musaFreeHost(p); + if (err != musaErrorMusartUnloading) { + PADDLE_ENFORCE_EQ( + err, + 0, + platform::errors::Fatal( + "musaFreeHost failed in GPUPinnedAllocator, error code is %d", + err)); + } #else err = cudaFreeHost(p); diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h index 67376a3e39a22..cde4743e620a9 100644 --- a/paddle/fluid/memory/allocation/system_allocator.h +++ b/paddle/fluid/memory/allocation/system_allocator.h @@ -43,7 +43,8 @@ class CPUAllocator : public SystemAllocator { virtual bool UseGpu() const; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class GPUAllocator : public SystemAllocator { public: explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} diff --git a/paddle/fluid/memory/allocation/system_allocator_test.cc b/paddle/fluid/memory/allocation/system_allocator_test.cc index e04d14f0adfde..d6a203ef38f47 100644 --- a/paddle/fluid/memory/allocation/system_allocator_test.cc +++ b/paddle/fluid/memory/allocation/system_allocator_test.cc @@ -57,7 +57,8 @@ TEST(CPUAllocator, LockMem) { TestAllocator(&a, 0); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a(0); TestAllocator(&a, 2048); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 46f9b1189cb68..01220b0e44240 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -57,7 +57,8 @@ void* GetBasePtr(const std::shared_ptr& allocation) { return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index b8f5f0289c4bc..2e029c4ebae88 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -48,7 +48,8 @@ extern bool InSameStream(const std::shared_ptr& allocation, extern void* GetBasePtr(const std::shared_ptr& allocation); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) extern uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 4a56a01e640bf..c8ce60e7c39d6 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -256,7 +256,8 @@ void Copy(phi::Place dst_place, #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) 
|| defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K #ifdef PADDLE_WITH_HIP @@ -271,6 +272,18 @@ inline void SyncCUDAStream() { } #endif } +#elif defined(PADDLE_WITH_MUSA) +inline void SyncCUDAStream() { +#if !defined(_WIN32) + musaStreamSynchronize(0); +#else + musaError_t e_sync = musaSuccess; + while (e_sync = musaStreamQuery(0)) { + if (e_sync == musaErrorNotReady) continue; + break; + } +#endif +} #else inline void SyncCUDAStream() { #if !defined(_WIN32) @@ -313,6 +326,12 @@ void Copy( num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToHost, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -325,6 +344,8 @@ void Copy( "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -357,6 +378,12 @@ void Copy( num, hipMemcpyHostToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyHostToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -369,6 +396,8 @@ void Copy( "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -403,6 +432,12 @@ void Copy( num, hipMemcpyDeviceToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -416,6 +451,8 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); #endif @@ -502,6 +539,12 @@ void Copy( num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyDeviceToHost, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -515,6 +558,8 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -544,6 +589,12 @@ void Copy( num, hipMemcpyHostToDevice, reinterpret_cast(stream)); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpyAsync(dst, + src, + num, + musaMemcpyHostToDevice, + reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -557,6 +608,8 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -746,7 +799,8 @@ void Copy(phi::Place dst_place, dst_place.GetType() == 
phi::AllocationType::CPU) { std::memcpy(dst, src, num); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::GPUPINNED) { std::memcpy(dst, src, num); diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc index 081f0d3d78c13..e51859e791a08 100644 --- a/paddle/fluid/memory/memory_stats_test.cc +++ b/paddle/fluid/memory/memory_stats_test.cc @@ -40,7 +40,8 @@ TEST(stat_allocator_test, host_memory_stat_test) { EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), max_alloc_size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(stat_allocator_test, device_memory_stat_test) { std::vector alloc_sizes{ 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 6ec8d77da2c85..7adcc1e09c24a 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -16,6 +16,10 @@ limitations under the License. */ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include +#endif + #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index d1dc7d8986bec..8cdb244d8af6f 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -55,7 +55,8 @@ struct ArrayToLoDFunctor { if (std::is_same::value) { Apply(static_cast(pool.Get(place))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) Apply(static_cast(pool.Get(place))); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 012edde57294a..1272a83b2b147 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -19,6 +19,9 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index f63baadbde526..2c4b4f1ceacf6 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -19,6 +19,11 @@ #include typedef hiprandState curandState; namespace cub = hipcub; + +#elif defined(PADDLE_WITH_MUSA) +#include +#include +#include #else #include #include @@ -72,6 +77,11 @@ __global__ void RandomSampleClassCenter(const int64_t n, CUDA_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } +#elif defined(PADDLE_WITH_MUSA) + murand_init(local_seed, id, increment, &localState); + CUDA_KERNEL_LOOP(i, n) { + buffer[i] = static_cast(murand(&localState) % max_val); + } #else curand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index e100397924af5..9e562cbf58dfe 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -39,7 +39,9 @@ template class CSyncCalcStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA)) && \ + !defined(_WIN32) auto place = ctx.GetPlace(); auto dev_ctx = static_cast( diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index bacbe014a343c..e5c918f0be9d0 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -60,6 +60,9 @@ class CWaitCommOp : public framework::OperatorBase { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(compute_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 34569b0a4b600..5276b1b15bcf8 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -61,6 +61,9 @@ class CWaitComputeOp : public framework::OperatorBase { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index 
0f04a295ed263..7db9932d99a98 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -77,7 +77,8 @@ class ConditionalOp : public framework::OperatorBase { ips[0]->numel())); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::DenseTensor cpu_tensor; framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index c2deeb4190986..e0748c008a564 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -218,7 +218,8 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE( ALL_LAYOUT, paddle::operators::FeedSparseCooTensorKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE( feed_sparse_coo_tensor, GPU, diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 9f67b1d4b6e18..9bbe605c8ccb6 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -26,7 +26,8 @@ namespace imperative { class OpBase; } // namespace imperative } // namespace paddle -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -34,7 +35,8 @@ namespace paddle { namespace operators { static size_t CUDADevCount() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return platform::GetGPUDeviceCount(); #else return 0UL; diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 6ae32f33e957a..3d25edfe2e130 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -228,7 +228,8 @@ bool GetCondData(const phi::DenseTensor &cond) { // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index adb60a8a8d064..c20b691a4300c 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -19,6 +19,9 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index b2bbd9c82095c..eba1c5127b8a9 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -12,6 +12,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index 484bd8454bae9..b01813a0cfc27 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -120,7 +120,8 @@ class TargetAssignKernel : public framework::OpKernel { int64_t k = x->dims()[2]; auto x_lod = x->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::MixVector mixv_x_lod(&x_lod); size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace()); #else @@ -137,7 +138,8 @@ class TargetAssignKernel : public framework::OpKernel { k, out_data, out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) mixv_x_lod.CopyToCPU(); #endif @@ -154,7 +156,8 @@ class TargetAssignKernel : public framework::OpKernel { "TargetAssignOp input(NegIndices) needs 1 level of LoD")); const int* neg_idx_data = neg_indices->data(); auto neg_lod = neg_indices->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::MixVector mixv_neg_lod(&neg_lod); size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace()); #else @@ -170,7 +173,8 @@ class TargetAssignKernel : public framework::OpKernel { mismatch_value, out_data, out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) mixv_neg_lod.CopyToCPU(); #endif } diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 45f34313d1a3d..c0799c4c861c4 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -188,7 +188,8 @@ class DGCOpKernel : public framework::OpKernel { int buf_size = paddle::communication::dgc::get_buffer_size(k); paddle::memory::allocation::AllocationPtr tmp_ious_data; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(dev_ctx.GetPlace())) { tmp_ious_data = memory::Alloc( dev_ctx.GetPlace(), diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index c69acb89750c9..1feb5a5e1fc71 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -32,9 +32,11 @@ limitations under the License. 
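Inside .cu and .cu.h sources the dispatch is keyed on the compiler rather than the build flag: __NVCC__ for nvcc, __HIPCC__ for hipcc, and now __MUSACC__ for the MUSA compiler, each selecting its own CUB-compatible header. The sketch below shows the combined include block; several hunks above have __MUSACC__ reuse the bundled cub/cub.cuh, while other include targets are elided in the patch text, so those header names are assumptions.

// Illustrative include-dispatch block, not part of the patch.
#ifdef __NVCC__
#include "cub/cub.cuh"          // NVIDIA: bundled CUB, as in collect_fpn_proposals_op.cu above
#endif
#ifdef __MUSACC__
#include "cub/cub.cuh"          // MUSA: the hunks above reuse the same bundled CUB
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>    // assumed AMD header name (elided in the patch text)
namespace cub = hipcub;         // keeps existing cub:: call sites compiling unchanged
#endif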
*/ #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #ifdef __NVCC__ #include +#elif defined(__MUSACC__) +#include #elif defined(__HIPCC__) #include #endif @@ -311,7 +313,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, } } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template GetReduceDim(const framework::DDim &in, return phi::funcs::GetReduceDim(in, out, axis); } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template void GetGradXAndYOut(const phi::GPUContext &dev_ctx, diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 107fe9f6174b6..82195e874f1b6 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -155,7 +155,8 @@ REGISTER_OP_CPU_KERNEL(expand_as_grad, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL(expand_as, ops::ExpandAsKernel, ops::ExpandAsKernel, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index fee4b47049301..5cb29c1d48dad 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -283,7 +283,8 @@ REGISTER_OP_CPU_KERNEL(expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index b6dd3ca8f64b2..507cbc0d31d3a 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -192,6 +192,8 @@ struct FindChannelAbsMaxFunctor { #ifdef PADDLE_WITH_HIP hipMemset(out_abs_max, 0, sizeof(T) * cout); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(out_abs_max, 0, sizeof(T) * cout); #else cudaMemset(out_abs_max, 0, sizeof(T) * cout); #endif // PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_ diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 53001b2493084..db5c1ddcbb375 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -17,6 +17,9 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 35574331e17d7..65d4ae2d4c5ec 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -46,6 +46,8 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { int device_id; #ifdef PADDLE_WITH_HIP hipGetDevice(&device_id); +#elif defined(PADDLE_WITH_MUSA) + musaGetDevice(&device_id); #else cudaGetDevice(&device_id); #endif @@ -76,6 +78,17 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { sizeof(int64_t) * input_num, hipMemcpyHostToDevice, device_ctx.stream()); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyAsync(in_ids_d, + in1s.data(), + sizeof(int64_t) * input_num, + musaMemcpyHostToDevice, + device_ctx.stream()); + musaMemcpyAsync(in_embs_d, + in2s.data(), + sizeof(int64_t) * input_num, + musaMemcpyHostToDevice, + device_ctx.stream()); #else cudaMemcpyAsync(in_ids_d, in1s.data(), diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index dee676a7640f4..4eea6ab366fb6 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -15,6 +15,9 @@ limitations under the License. */ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 362860aa23bdf..89838e6084ab3 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -150,6 +150,34 @@ void FusedSeqpoolCVM(const framework::ExecutionContext lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); +#elif defined(PADDLE_WITH_MUSA) + T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); + platform::GpuMemcpyAsync(gpu_input_values, + input_data.data(), + input_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + T **gpu_output_values = + reinterpret_cast(&gpu_input_values[input_data.size()]); + platform::GpuMemcpyAsync(gpu_output_values, + output_data.data(), + output_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + T **gpu_seqpool_output_values = + reinterpret_cast(&gpu_output_values[output_data.size()]); + platform::GpuMemcpyAsync(gpu_seqpool_output_values, + seqpool_output_data.data(), + seqpool_output_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + size_t **lods_values = reinterpret_cast( + &gpu_seqpool_output_values[seqpool_output_data.size()]); + platform::GpuMemcpyAsync(lods_values, + lods.data(), + lods.size() * sizeof(size_t *), + musaMemcpyHostToDevice, + stream); #else T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_input_values, @@ -356,6 +384,37 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); +#elif defined(PADDLE_WITH_MUSA) + T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); + platform::GpuMemcpyAsync(gpu_out_grads_values, + out_grads_data.data(), + out_grads_data.size() * sizeof(T 
*), + musaMemcpyHostToDevice, + stream); + + T **gpu_in_grads_values = + reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]); + platform::GpuMemcpyAsync(gpu_in_grads_values, + in_grads_data.data(), + in_grads_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + T **gpu_cvm_values = + reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]); + platform::GpuMemcpyAsync(gpu_cvm_values, + cvm_data.data(), + cvm_data.size() * sizeof(T *), + musaMemcpyHostToDevice, + stream); + + size_t **lods_values = + reinterpret_cast(&gpu_cvm_values[cvm_data.size()]); + platform::GpuMemcpyAsync(lods_values, + lods.data(), + lods.size() * sizeof(size_t *), + musaMemcpyHostToDevice, + stream); #else T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_out_grads_values, diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index 8402bc78ef64c..5ee1ce015386f 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -329,6 +329,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); #ifdef PADDLE_WITH_HIP hipMemset(temp_qk_bias, 0, sizeof(float) * size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(temp_qk_bias, 0, sizeof(float) * size); #else cudaMemset(temp_qk_bias, 0, sizeof(float) * size); #endif diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu index 72bb97a2aae9e..72077400d5d2a 100644 --- a/paddle/fluid/operators/fused/yolo_box_post_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -255,6 +255,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + bbox_count_device_ptr, &bbox_count, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), cudaMemcpyHostToDevice); @@ -268,6 +271,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + &bbox_count, bbox_count_device_ptr, sizeof(int), musaMemcpyDeviceToHost); #else cudaMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), cudaMemcpyDeviceToHost); @@ -283,6 +289,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipFree(bbox_tensor); hipMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); +#elif defined(PADDLE_WITH_MUSA) + musaFree(bbox_tensor); + musaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); #else cudaFree(bbox_tensor); cudaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); @@ -296,6 +305,9 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpy( + bbox_index_device_ptr, &bbox_index, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), cudaMemcpyHostToDevice); @@ -356,6 +368,13 @@ class YoloBoxPostKernel : public framework::OpKernel { anchors.data(), anchors.size() * sizeof(int), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&device_anchors), + anchors.size() * sizeof(int)); + musaMemcpy(device_anchors, + 
anchors.data(), + anchors.size() * sizeof(int), + musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&device_anchors), anchors.size() * sizeof(int)); @@ -388,6 +407,10 @@ class YoloBoxPostKernel : public framework::OpKernel { hipMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc( + reinterpret_cast(&ts_info[i].bboxes_dev_ptr), + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); #else cudaMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), @@ -398,6 +421,9 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), + sizeof(int)); #else cudaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); @@ -409,6 +435,8 @@ class YoloBoxPostKernel : public framework::OpKernel { int* bbox_index_device_ptr; #ifdef PADDLE_WITH_HIP hipMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); +#elif defined(PADDLE_WITH_MUSA) + musaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #else cudaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #endif @@ -456,6 +484,12 @@ class YoloBoxPostKernel : public framework::OpKernel { ts_info[ts_id].bboxes_dev_ptr, ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), hipMemcpyDeviceToHost); +#elif defined(PADDLE_WITH_MUSA) + musaMemcpyAsync( + ts_info[ts_id].bboxes_host_ptr, + ts_info[ts_id].bboxes_dev_ptr, + ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), + musaMemcpyDeviceToHost); #else cudaMemcpyAsync( ts_info[ts_id].bboxes_host_ptr, @@ -534,6 +568,8 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipFree(bbox_index_device_ptr); +#elif defined(PADDLE_WITH_MUSA) + musaFree(bbox_index_device_ptr); #else cudaFree(bbox_index_device_ptr); #endif @@ -541,6 +577,9 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipFree(ts_info[i].bboxes_dev_ptr); hipFree(ts_info[i].bbox_count_device_ptr); +#elif defined(PADDLE_WITH_MUSA) + musaFree(ts_info[i].bboxes_dev_ptr); + musaFree(ts_info[i].bbox_count_device_ptr); #else cudaFree(ts_info[i].bboxes_dev_ptr); cudaFree(ts_info[i].bbox_count_device_ptr); diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 32e7cffa4984b..4065fd1e017ea 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -31,6 +31,10 @@ limitations under the License. */ #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #ifdef PADDLE_WITH_HIP #include #include diff --git a/paddle/fluid/operators/fused_token_prune_op.cu b/paddle/fluid/operators/fused_token_prune_op.cu index 8f0a53611f3b2..4ff5fd33df3d6 100644 --- a/paddle/fluid/operators/fused_token_prune_op.cu +++ b/paddle/fluid/operators/fused_token_prune_op.cu @@ -14,6 +14,9 @@ limitations under the License. 
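yolo_box_post_op.cu above manages its scratch buffers with raw runtime calls, and the MUSA branch swaps each cuda*/hip* call for its musa* counterpart. Below is a compressed sketch of the grow-the-buffer and counter-reset idioms from those hunks; the header name is an assumption, the runtime calls come from the patch.

// Illustrative sketch, not part of the patch.
#include <musa_runtime.h>  // assumed header name

// Re-size the box buffer once the real box count is known, as YoloTensorParseCuda does.
void ResizeBBoxBuffer(float** bbox_tensor, int bbox_count, int class_num) {
  musaFree(*bbox_tensor);
  musaMalloc(reinterpret_cast<void**>(bbox_tensor),
             bbox_count * (5 + class_num) * sizeof(float));
}

// Reset a single device-side counter with a blocking host-to-device copy.
void ResetCounter(int* bbox_count_device_ptr) {
  int zero = 0;
  musaMemcpy(bbox_count_device_ptr, &zero, sizeof(int), musaMemcpyHostToDevice);
}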
*/ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index 8ae92b04b7df4..853540f7a2b9b 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -111,7 +111,8 @@ PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index e533960c8a648..0e96f7164e913 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -32,6 +32,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include #include +#elif defined(PADDLE_WITH_MUSA) +#include +#include #else #include #include @@ -95,6 +98,12 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); +#elif defined(PADDLE_WITH_MUSA) + murandState rng; + murand_init(rand_seed * gridDim.x + blockIdx.x, + threadIdx.y * WARP_SIZE + threadIdx.x, + 0, + &rng); #else curandState rng; curand_init(rand_seed * gridDim.x + blockIdx.x, @@ -128,6 +137,8 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, for (int idx = k + threadIdx.x; idx < deg; idx += WARP_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); +#elif defined(PADDLE_WITH_MUSA) + const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index dea3ce3fe695b..01d9642a49404 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -156,7 +156,8 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( hinge_loss_grad, CPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( hinge_loss, GPU, ALL_LAYOUT, ops::HingeLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 8c123bb8a32f2..56fa8cfc4b0cd 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -201,7 +201,8 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( im2sequence_grad, CPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( im2sequence, GPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index aab7953d6d103..b6a8b52c04083 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -67,7 +67,8 @@ bool TensorIsfinite(const phi::DenseTensor& tensor); FiniteVisitor(Isnan, Any, CPU); FiniteVisitor(Isinf, Any, CPU); 
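class_center_sample_op.cu and graph_khop_sampler_op.cu above replace the cuRAND device API with its MUSA analogue: one murandState per thread, seeded with murand_init and advanced with murand. Below is a device-side sketch of that usage; murandState, murand_init, and murand appear in the patch, but the device header that declares them is elided, so no include is shown.

// Illustrative device-side RNG loop, not part of the patch. Assumes it is
// compiled by the MUSA compiler with the murand device header available.
__global__ void FillRandomMod(int64_t* buffer, int64_t n, int64_t max_val,
                              unsigned long long seed, unsigned long long increment) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  murandState local_state;
  murand_init(seed, id, increment, &local_state);   // per-thread subsequence, as above
  for (int64_t i = id; i < n; i += gridDim.x * blockDim.x) {
    buffer[i] = static_cast<int64_t>(murand(&local_state) % max_val);
  }
}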
FiniteVisitor(Isfinite, All, CPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) FiniteVisitor(Isnan, Any, GPU); FiniteVisitor(Isinf, Any, GPU); FiniteVisitor(Isfinite, All, GPU); @@ -82,7 +83,8 @@ inline void TensorContainsNAN(const phi::DenseTensor& tensor, IsnanVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsnanVisitorGPU(tensor, out)); @@ -99,7 +101,8 @@ inline void TensorContainsInf(const phi::DenseTensor& tensor, IsinfVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsinfVisitorGPU(tensor, out)); @@ -116,7 +119,8 @@ inline void TensorIsfinite(const phi::DenseTensor& tensor, IsfiniteVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsfiniteVisitorGPU(tensor, out)); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 92f190c0025ed..c859183fd9661 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -96,7 +96,8 @@ PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index dd85ccff87f2d..fc3845703bef4 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -133,7 +133,8 @@ PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, CPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, GPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 94b0319729117..05a6a5c86831c 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -66,7 +66,8 @@ struct LoDTensorToArrayFunctor { if (std::is_same::value) { Apply(static_cast(dev_ctx)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) Apply(static_cast(dev_ctx)); #else PADDLE_THROW( diff --git 
a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 11c35293ebe34..b1282585bda6e 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -221,6 +221,9 @@ struct LookupTableV2GradCUDAFunctor { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index d741bc5b42549..4829c8f6c46c9 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -16,6 +16,8 @@ #ifdef PADDLE_WITH_HIP #include namespace cub = hipcub; +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index af14333b9d1ea..7b14e8541fd02 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -21,5 +21,6 @@ endif() math_library(unpooling) math_library(prelu) -math_library(bert_encoder_functor) +# TODO(@caizhi): enable it +#math_library(bert_encoder_functor) math_library(tree2col DEPS phi) diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 9a0b5a1ae3ab7..91fdcf82e83d0 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -20,6 +20,10 @@ limitations under the License. */ #include // NOLINT #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #ifdef PADDLE_WITH_HIP #include @@ -47,7 +51,8 @@ struct CUDATypeTraits { typedef float TYPE; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // This functor involves a fusion calculation in Ernie or Bert. 
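lookup_table_v2_op.cu above zeroes the gradient table with musaMemsetAsync on the device context stream, wrapped in PADDLE_ENFORCE_GPU_SUCCESS so a failing status becomes a Paddle error. A stripped-down sketch of the call shape without the Paddle plumbing (illustrative; header and stream type names are assumptions):

// Illustrative sketch, not part of the patch.
#include <musa_runtime.h>  // assumed header name
#include <cstdint>

// Asynchronously zero an N x D table of T on `stream`; in the hunk above the
// same call is wrapped in PADDLE_ENFORCE_GPU_SUCCESS.
template <typename T>
void ZeroTableAsync(T* d_table, int64_t n, int64_t d, musaStream_t stream) {
  musaMemsetAsync(d_table, 0, n * d * sizeof(T), stream);
}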
// The fusion mode is as follows: // diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 3032b78a2029d..0d8049023d2cd 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -17,6 +17,9 @@ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index 00ff1fbcbc38d..04e390499cb7f 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -23,7 +23,8 @@ namespace paddle { namespace operators { namespace math { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template class PreluChannelWiseDirectCUDAFunctor { public: diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 0c6b49729546c..5ab90409df1e7 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -160,6 +160,11 @@ void GPUSampleWithProb::operator()(const phi::GPUContext& context, s_data, sizeof(int64_t) * num_samples, hipMemcpyHostToDevice)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(samples_data + num_true, + s_data, + sizeof(int64_t) * num_samples, + musaMemcpyHostToDevice)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 7c60be6841552..d1487d9c57360 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -106,7 +106,8 @@ class SampleWithProb { } }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template class GPUSampleWithProb { public: diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index e1a36fa41894d..af41335bffa86 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -75,7 +75,7 @@ class MatMulKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) head_number = context.Attr("head_number"); #endif @@ -89,7 +89,7 @@ class MatMulKernel : public framework::OpKernel { } } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); if (head_number > 1) { @@ -241,7 +241,7 @@ class MatMulGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) if (context.HasAttr("head_number")) { head_number = context.Attr("head_number"); } @@ -373,7 +373,7 @@ class MatMulDoubleGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) head_number = context.Attr("head_number"); #endif @@ -615,7 +615,7 @@ class MatMulOp : public 
framework::OperatorWithKernel { } int64_t dim_out_y = mat_dim_y.width_; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) int head_number = context->Attrs().Get("head_number"); bool split_vertical_y = (mat_dim_x.width_ != mat_dim_y.height_); if (context->IsRuntime()) { @@ -758,7 +758,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra(); #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) AddAttr("head_number", "The number of heads of the matrix") .SetDefault(1); #endif @@ -926,7 +926,8 @@ REGISTER_OP_CPU_KERNEL(matmul_grad_grad, ops::MatMulDoubleGradKernel, ops::MatMulDoubleGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 5f480461d77cd..fff8b36d68405 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -39,7 +39,8 @@ class MemcpyH2DFunctor { void operator()(const phi::DenseTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto stream = static_cast(&dev_ctx_)->stream(); #else auto stream = nullptr; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 007f853f3243f..afa281f3679cc 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -68,7 +68,8 @@ class MergeLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 8c33a5da1baff..1de4eed001b92 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -157,6 +157,7 @@ REGISTER_OPERATOR(minus, ops::MinusGradMaker); PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {} #endif diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h index 01905d8ca84b3..70342339a55a1 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.h +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h @@ -25,6 +25,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" +#elif defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/platform/dynload/mccl.h" #else #include "paddle/fluid/platform/dynload/nccl.h" #endif diff --git a/paddle/fluid/operators/nop_op.cc b/paddle/fluid/operators/nop_op.cc index 69f0bfb2abcd3..cc3bbe8eac3ac 100644 --- a/paddle/fluid/operators/nop_op.cc +++ b/paddle/fluid/operators/nop_op.cc @@ -60,6 +60,7 @@ REGISTER_OP_WITHOUT_GRADIENT(nop, ops::NopOp, ops::NopOpMaker); PD_REGISTER_STRUCT_KERNEL(nop, CPU, ALL_LAYOUT, ops::NopKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(nop, GPU, ALL_LAYOUT, ops::NopKernel, float) {} #endif diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index cad7e38ba1c1a..411988f4f0560 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -32,6 +32,10 @@ #include "cub/cub.cuh" #include "math.h" // NOLINT #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#include "math.h" // NOLINT +#endif #ifdef __HIPCC__ #include @@ -53,6 +57,8 @@ static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { static_assert(!std::is_same::value, "T cannot be void."); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(x, 0, n * sizeof(T), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); #endif @@ -254,6 +260,10 @@ static bool IsFinite(const phi::GPUContext &dev_ctx, const float *ptr) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( &cpu_value, ptr, sizeof(float), hipMemcpyDeviceToHost, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync( + &cpu_value, ptr, sizeof(float), musaMemcpyDeviceToHost, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( &cpu_value, ptr, sizeof(float), cudaMemcpyDeviceToHost, stream)); @@ -1133,6 +1143,10 @@ static std::string GetMinMaxStr(const T *x, size_t n, const phi::Place &place) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( &ret_cpu[0], ret, 2 * sizeof(T), hipMemcpyDeviceToHost, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync( + &ret_cpu[0], ret, 2 * sizeof(T), musaMemcpyDeviceToHost, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( &ret_cpu[0], ret, 2 * sizeof(T), cudaMemcpyDeviceToHost, stream)); @@ -1189,6 +1203,12 @@ static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) { sizeof(flag), hipMemcpyDeviceToHost, dev_ctx.stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(&flag, + out.Get(), + sizeof(flag), + musaMemcpyDeviceToHost, + dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&flag, out.Get(), diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index f1b162be46610..1f3ae2f9e318e 100644 --- 
a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -28,6 +28,9 @@ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; @@ -460,7 +463,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { grad_index.mutable_data({num_index}, ctx.GetPlace()); if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) auto sort_value_ptr = sort_value.mutable_data({num_index}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index d00cefab45045..500c375212bf9 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -260,7 +260,8 @@ PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(pad_constant_like, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index e2417a071ce88..a10f59f8a2fbe 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include "paddle/phi/backends/gpu/gpu_primitives.h" #endif @@ -85,7 +85,7 @@ inline HOSTDEVICE T PrRoIPoolingMatCalculation(const T* this_data, return sum_out; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE void PrRoIPoolingDistributeDiff(T* diff, const T top_diff, @@ -163,7 +163,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE void AccumulateRois(T* offset, T data) { phi::CudaAtomicAdd(offset, data); @@ -175,7 +175,7 @@ inline HOSTDEVICE void AccumulateRois(T* offset, T data) { } #endif -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template DEVICE T MaxFunctor(const T x, const T y) { return max(x, y); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 99e8d04a9e329..eccc679666c58 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -107,7 +107,8 @@ PD_REGISTER_STRUCT_KERNEL(send_and_recv, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL(send_and_recv, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index fc625826b9a91..91f76d2525de3 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -19,7 +19,8 @@ #include "paddle/fluid/framework/op_registry.h" #include 
"paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include #endif @@ -37,7 +38,8 @@ struct Random { using UniformIntDist = std::uniform_int_distribution; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template <> struct Random { using Engine = thrust::minstd_rand; diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index ebdddfd41b33f..712aac0e50716 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -246,7 +246,8 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( rank_loss_grad, CPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_STRUCT_KERNEL( rank_loss, GPU, ALL_LAYOUT, ops::RankLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index a0ad7e3939a02..99caa24a51078 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -48,7 +48,8 @@ BufferedReader::BufferedReader( buffer_size_(buffer_size), pin_memory_(pin_memory) { VLOG(1) << "BufferedReader"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(place_) && !pin_memory) { int dev_idx = place_.device; compute_stream_ = @@ -118,7 +119,8 @@ void BufferedReader::ReadAsync(size_t i) { return -1UL; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // @{ Group GPU Place +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // @{ Group GPU Place if (platform::is_gpu_place(place_)) { TensorVec &cuda = cuda_buffer_[i]; if (cuda.empty()) { @@ -197,6 +199,11 @@ void BufferedReader::ReadAsync(size_t i) { hipEventRecord(events_[i].get(), compute_stream_)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream_.get(), events_[i].get(), 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventRecord(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_[i].get(), compute_stream_)); diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 032a74b7e23f1..ff902cc66445b 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -21,7 +21,8 @@ #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -80,7 +81,8 @@ class BufferedReader : public framework::DecoratedReader { std::vector xpu_buffer_; std::vector custom_device_buffer_; size_t prev_pos_{-1UL}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuStream_t compute_stream_; std::shared_ptr stream_; std::vector> events_; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 962b18c995979..b14eef3f29beb 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -429,7 +429,8 @@ class ReshapeKernel { pt_scalar_shape, out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeInferKernel(static_cast(dev_ctx), @@ -462,7 +463,8 @@ class ReshapeGradKernel { phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( @@ -492,7 +494,8 @@ class ReshapeDoubleGradKernel { phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( @@ -761,7 +764,8 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer, Reshape2DoubleGradInferShapeFunctor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index bc1f5a0d34f60..e786ea83fad73 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -117,7 +117,8 @@ PD_REGISTER_KERNEL(save_sr, phi::dtype::float16, phi::dtype::bfloat16) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(save, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/select_op_helper.h b/paddle/fluid/operators/select_op_helper.h index 2b7f884f6170c..4843492101b05 100644 --- a/paddle/fluid/operators/select_op_helper.h +++ b/paddle/fluid/operators/select_op_helper.h @@ -39,8 +39,9 @@ inline int GetBranchNumber(const phi::DenseTensor &mask) { } // when platform::is_gpu_place(mask.place()) is true std::unique_ptr cpu_mask{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUSTOM_DEVICE) || \ + defined(PADDLE_WITH_XPU) framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 2236988025cbc..4a715d0e35972 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -136,7 +136,8 @@ class SequenceReverseOpKernel : public 
framework::OpKernel { const size_t *lod; size_t lod_count = x.lod()[0].size(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_gpu_place(ctx.GetPlace())) { auto xlod = x.lod()[0]; phi::MixVector mixv_xlod(&xlod); @@ -144,7 +145,8 @@ class SequenceReverseOpKernel : public framework::OpKernel { } else { #endif lod = x.lod()[0].data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 0ca5514900d46..77f729e0f91ca 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -16,7 +16,8 @@ limitations under the License. */ #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 897ff207f5eca..7411ecc05358c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -18,6 +18,10 @@ limitations under the License. */ #include #endif +#ifdef __MUSACC__ +#include +#endif + #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu index 5069cf1e512cb..c4235a17f9918 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cu +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifndef _MSC_VER #include diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index e648575a1edca..27f947f434a07 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -69,7 +69,8 @@ class SplitLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index ccf5cd09a0842..e0004f197cd55 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -478,7 +478,7 @@ struct DeviceIndependenceTensorOperations { std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(phi::make_ddim(out_shape)); if (platform::is_gpu_place(context.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) // For GPU, there is no need to define XxxInverseFunctor and call // ElementwiseComputeEx in two branches. 
ElementwiseComputeEx, DeviceContext, InT>( diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu index 84e30250f85fd..1b24ea8276e24 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -320,6 +320,24 @@ PD_REGISTER_KERNEL(sync_batch_norm, kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } } +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormKernel, + float, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(sync_batch_norm, @@ -376,6 +394,18 @@ PD_REGISTER_KERNEL(sync_batch_norm_grad, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } } +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormGradKernel, + float, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad + } +} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(sync_batch_norm_grad, @@ -404,6 +434,12 @@ PD_REGISTER_KERNEL(sync_batch_norm_coo, phi::sparse::SyncBatchNormCooKernel, float, phi::dtype::float16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm_coo, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooKernel, + float, #else PD_REGISTER_KERNEL(sync_batch_norm_coo, GPU, @@ -421,6 +457,12 @@ PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, phi::sparse::SyncBatchNormCooGradKernel, float, phi::dtype::float16) {} +#elif defined(PADDLE_WITH_MUSA) +PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooGradKernel, + float, #else PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, GPU, diff --git a/paddle/fluid/operators/sync_batch_norm_utils.h b/paddle/fluid/operators/sync_batch_norm_utils.h index 7c14f6dfac324..ebc825b66a5ef 100644 --- a/paddle/fluid/operators/sync_batch_norm_utils.h +++ b/paddle/fluid/operators/sync_batch_norm_utils.h @@ -22,6 +22,9 @@ limitations under the License. */ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index f1674bc5005a0..fede7fe5156d0 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -18,6 +18,9 @@ limitations under the License. 
*/ #ifdef __NVCC__ #include "cub/cub.cuh" #endif +#ifdef __MUSACC__ +#include "cub/cub.cuh" +#endif #ifdef __HIPCC__ #include #endif diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 16bce515f2a7f..12725c397faf6 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include "paddle/phi/core/generator.h" @@ -113,7 +113,7 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template struct UniformGenerator { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 4d7f496aaa42d..8b7c77d720fed 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -64,7 +64,9 @@ if(WITH_DGC) set(dgc_deps dgc) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) endif() @@ -90,8 +92,14 @@ if(WITH_ROCM) SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) endif() +if(WITH_MUSA) + musa_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS + simple_threadpool enforce) +endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) set(STREAM_CALLBACK_DEPS stream_callback_manager) else() set(STREAM_CALLBACK_DEPS) @@ -137,7 +145,9 @@ cc_library( SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) target_link_libraries(device_context gpu_resource_pool) endif() @@ -235,6 +245,13 @@ if(WITH_ROCM) DEPS device_context gpu_info) endif() +if(WITH_MUSA) + musa_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") +endif() + cc_library(timer SRCS timer.cc) cc_test( timer_test @@ -281,6 +298,20 @@ elseif(WITH_ROCM) stats op_proto_maker shape_inference) +elseif(WITH_MUSA) + musa_library( + profiler + SRCS + profiler.cc + profiler.cu + DEPS + phi + gpu_info + enforce + new_profiler + stats + op_proto_maker + shape_inference) elseif(WITH_XPU) cc_library( profiler @@ -339,6 +370,10 @@ if(WITH_GPU) DEPS gpu_info) endif() +if(WITH_MUSA) + musa_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) +endif() + if(WITH_ROCM) hip_test( float16_gpu_test diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index b133a57d523ac..a6c2b9d61dd2b 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -174,6 +174,8 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( for (int i = 0; i < kDevices; i++) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaSetDevice(i)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(i)); #endif diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu index b814bcde6841f..b78e4332d4bb0 100644 --- a/paddle/fluid/platform/complex_test.cu +++ b/paddle/fluid/platform/complex_test.cu @@ -27,7 +27,8 @@ #include 
"paddle/fluid/platform/enforce.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 6f0d86f0a4b17..10f7143028225 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,7 +1,9 @@ set(DEV_LIBS custom_device) # GPU -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) add_subdirectory(gpu) endif() diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index aa2dba03c9082..4a984cb34aae8 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -16,7 +16,8 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 897f8d3732b73..85a86ae8ecedd 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -22,6 +22,17 @@ elseif(WITH_ROCM) cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +elseif(WITH_MUSA) + musa_library( + gpu_info + SRCS + gpu_info.cc + DEPS + phi + glog + enforce + monitor + dynload_cuda) endif() cc_library( diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h index 878a122a49224..e6cac0e084ee5 100644 --- a/paddle/fluid/platform/device/gpu/gpu_helper.h +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -13,11 +13,12 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" -#else +#elif defined(PADDLE_WITH_CUDA) #include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h" #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 7f1f2c76bd630..ea85562ababb6 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -35,6 +35,8 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" +#elif defined(PADDLE_WITH_MUSA) +// TODO(Xiaokang Shang) #else #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" @@ -216,6 +218,12 @@ class RecordedGpuMallocHelper { } else { result = hipMalloc(ptr, size); } +#elif defined(PADDLE_WITH_MUSA) + if (UNLIKELY(malloc_managed_memory)) { + result = musaMallocManaged(ptr, size); + } else { + result = musaMalloc(ptr, size); + } #else phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard; if (UNLIKELY(malloc_managed_memory)) { @@ -262,6 +270,9 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_HIP auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { +#elif defined(PADDLE_WITH_MUSA) + auto err = musaFree(ptr); + if (err != musaErrorInvalidValue) { #else auto err = cudaFree(ptr); VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) @@ -309,6 +320,8 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); #ifdef PADDLE_WITH_HIP auto result = hipMemGetInfo(actual_avail, actual_total); +#elif defined(PADDLE_WITH_MUSA) + auto result = musaMemGetInfo(actual_avail, actual_total); #else auto result = cudaMemGetInfo(actual_avail, actual_total); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index de68329bba66d..3d76f09da559b 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -11,7 +11,8 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index d253a92c986ce..adde00d2f1b7a 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -16,10 +16,13 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 9f2168e1cdb8b..7cf3659d596e9 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -30,6 +31,9 @@ CudaStreamResourcePool::CudaStreamResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamCreateWithFlags(&stream, musaStreamNonBlocking)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); @@ -41,6 +45,8 @@ CudaStreamResourcePool::CudaStreamResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -82,6 +88,9 @@ CudaEventResourcePool::CudaEventResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaEventCreateWithFlags(&event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); @@ -93,6 +102,8 @@ CudaEventResourcePool::CudaEventResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h index 2ac13e692f783..298e795524b4a 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h @@ -14,13 +14,19 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index c9afafdef7166..43a08c3c3b911 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -15,7 +15,8 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include @@ -23,6 +24,10 @@ #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#elif defined(PADDLE_WITH_MUSA) +#include +#include "paddle/fluid/platform/dynload/mublas.h" +using mudnnHandle_t = class Handle*; #else #include @@ -34,19 +39,49 @@ namespace paddle { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; + +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; #else // CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, 
CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, + cudaMemcpyKind, + hipMemcpyKind, + musaMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, + cudaDeviceProp, + hipDeviceProp_t, + musaDeviceProp); + +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); +DECLARE_TYPE_FOR_GPU(blasHandle_t, + cublasHandle_t, + rocblas_handle, + mublasHandle_t); + +using CUDAGraphID = unsigned long long; // NOLINT + +#undef DECLARE_TYPE_FOR_GPU + +// TODO(Xiaokang Shang): confirm mudnn type +#ifndef PADDLE_WITH_MUSA +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; +#elif defined(PADDLE_WITH_CUDA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, @@ -80,32 +115,33 @@ DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); - -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); - // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. 
DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - -using CUDAGraphID = unsigned long long; // NOLINT - #undef DECLARE_TYPE_FOR_GPU +#endif #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ + constexpr auto GPU_CV = MUSA_CV; #else // CDUA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + hipErrorOutOfMemory, + musaErrorMemoryAllocation); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, + cudaErrorNotReady, + hipErrorNotReady, + musaErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); #undef DECLARE_CONSTANT_FOR_GPU } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 6afcd2eb7cd97..be988cabdb5d1 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -32,6 +32,9 @@ #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" #endif +#ifdef PADDLE_WITH_MCCL +#include "paddle/fluid/platform/dynload/mccl.h" +#endif #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index 6b58453f03ea8..3070de23ca219 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -45,7 +45,8 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { )"; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) TEST(DeviceCode, cuda) { if (!phi::dynload::HasNVRTC() || !phi::dynload::HasCUDADriver()) { return; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 456abd55ef68f..fac5995371c8d 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -27,7 +27,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/expect.h" #include "paddle/phi/core/generator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -53,7 +54,8 @@ DeviceType Place2DeviceType(const platform::Place& place) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template typename std::enable_if::value, DevCtx*>::type @@ -86,7 +88,8 @@ inline std::unique_ptr CreateDeviceContext( DevCtx* dev_ctx = ConstructDevCtx(p, stream_priority); auto& instance = paddle::memory::allocation::AllocatorFacade::Instance(); if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto* cuda_ctx = dynamic_cast(dev_ctx); PADDLE_ENFORCE_NOT_NULL( cuda_ctx, @@ -172,7 +175,8 @@ void EmplaceDeviceContexts( /*unused*/ stream_priority); #endif } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) EmplaceDeviceContext( place_to_device_context, place, @@ -209,7 +213,8 @@ void EmplaceDeviceContexts( "option.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) EmplaceDeviceContext( place_to_device_context, place, diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b07b3f29dafde..0e54bab9a6871 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -42,6 +42,17 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" +#include "paddle/fluid/platform/dynload/mublas.h" +#include "paddle/fluid/platform/dynload/musparse.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/fluid/platform/dynload/mccl.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/dynload/miopen.h" @@ -136,7 +147,8 @@ namespace xpu = baidu::xpu::api; using XPUDeviceContext = phi::XPUContext; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) using CUDAPinnedDeviceContext = phi::GPUPinnedContext; #endif @@ -165,7 +177,8 @@ struct DefaultDeviceContextType { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template <> struct DefaultDeviceContextType { using TYPE = paddle::platform::CUDAPinnedDeviceContext; diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 402974b89e5c9..2287ffada5872 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -31,7 +31,8 @@ using ::paddle::platform::kXPU; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) USE_EVENT(kCUDA); USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index 37da8daf7fd69..f0bbb411abb89 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -15,7 +15,8 @@ #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) namespace paddle { namespace platform { struct CUDADeviceEventWrapper { diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 4cb3bfdb3adae..95a488e3b9dba 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -23,6 +23,10 @@ if(WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc) endif() +if(WITH_MUSA) + list(APPEND MUSA_SRCS mublas.cc murand.cc musparse.cc) +endif() + # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows. 
if(NOT APPLE) @@ -39,6 +43,12 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() + if(WITH_MUSA) + list(APPEND MUSA_SRCS musa_driver.cc musartc.cc) + if(WITH_MCCL) + list(APPEND MUSA_SRCS mccl.cc) + endif() + endif() endif() if(TENSORRT_FOUND) @@ -62,6 +72,12 @@ if(WITH_ROCM) dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi) +elseif(WITH_MUSA) + musa_library(dynload_cuda SRCS ${MUSA_SRCS} DEPS dynamic_loader phi) + cc_library( + dynload_warpctc + SRCS warpctc.cc + DEPS dynamic_loader warpctc phi) else() nv_library( dynload_cuda diff --git a/paddle/fluid/platform/dynload/mccl.cc b/paddle/fluid/platform/dynload/mccl.cc new file mode 100644 index 0000000000000..ea5df00912dd4 --- /dev/null +++ b/paddle/fluid/platform/dynload/mccl.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mccl.h b/paddle/fluid/platform/dynload/mccl.h new file mode 100644 index 0000000000000..2f22f65d699d6 --- /dev/null +++ b/paddle/fluid/platform/dynload/mccl.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/mccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#define MCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(mcclCommInitAll); \ + __macro(mcclGetUniqueId); \ + __macro(mcclCommInitRank); \ + __macro(mcclCommDestroy); \ + __macro(mcclCommCount); \ + __macro(mcclCommCuDevice); \ + __macro(mcclCommUserRank); \ + __macro(mcclAllReduce); \ + __macro(mcclBcast); \ + __macro(mcclAllGather); \ + __macro(mcclGroupStart); \ + __macro(mcclGroupEnd); \ + __macro(mcclReduce); \ + __macro(mcclReduceScatter); \ + __macro(mcclGetErrorString); \ + __macro(mcclBroadcast); \ + __macro(mcclGetVersion); \ + __macro(mcclSend); \ + __macro(mcclRecv); \ + __macro(mcclRedOpCreatePreMulSum); \ + __macro(mcclRedOpDestroy); + +MCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mublas.cc b/paddle/fluid/platform/dynload/mublas.cc new file mode 100644 index 0000000000000..ae98e1a5c01bd --- /dev/null +++ b/paddle/fluid/platform/dynload/mublas.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mublas.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mublas.h b/paddle/fluid/platform/dynload/mublas.h new file mode 100644 index 0000000000000..d958d9ac7c9b6 --- /dev/null +++ b/paddle/fluid/platform/dynload/mublas.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include // NOLINT +#include + +#include "paddle/phi/backends/dynload/mublas.h" + +namespace paddle { +namespace platform { +namespace dynload { + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mublas routine + * via operator overloading. 
+ * + * note: default dynamic linked libs + */ +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#define MUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(mublasSaxpy); \ + __macro(mublasDaxpy); \ + __macro(mublasCaxpy); \ + __macro(mublasZaxpy); \ + __macro(mublasSscal); \ + __macro(mublasDscal); \ + __macro(mublasScopy); \ + __macro(mublasDcopy); \ + __macro(mublasSgemv); \ + __macro(mublasDgemv); \ + __macro(mublasCgemv); \ + __macro(mublasZgemv); \ + __macro(mublasSgemm); \ + __macro(mublasDgemm); \ + __macro(mublasCgemm); \ + __macro(mublasZgemm); \ + __macro(mublasHgemm); \ + __macro(mublasSgeam); \ + __macro(mublasDgeam); \ + __macro(mublasDtrsm); \ + __macro(mublasCtrsm); \ + __macro(mublasZtrsm); \ + __macro(mublasCreate); \ + __macro(mublasDestroy); \ + __macro(mublasSetStream); \ + __macro(mublasSetPointerMode); \ + __macro(mublasGetPointerMode); \ + __macro(mublasSgemmBatched); \ + __macro(mublasDgemmBatched); \ + __macro(mublasCgemmBatched); \ + __macro(mublasZgemmBatched); + +MUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/murand.cc b/paddle/fluid/platform/dynload/murand.cc new file mode 100644 index 0000000000000..d1af076066117 --- /dev/null +++ b/paddle/fluid/platform/dynload/murand.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/murand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/murand.h b/paddle/fluid/platform/dynload/murand.h new file mode 100644 index 0000000000000..cf8ecf51595e0 --- /dev/null +++ b/paddle/fluid/platform/dynload/murand.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/murand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MURAND_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#define MURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(murandCreateGenerator); \ + __macro(murandSetStream); \ + __macro(murandSetPseudoRandomGeneratorSeed); \ + __macro(murandGenerateUniform); \ + __macro(murandGenerateUniformDouble); \ + __macro(murandGenerateNormal); \ + __macro(murandDestroyGenerator); + +MURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musa_driver.cc b/paddle/fluid/platform/dynload/musa_driver.cc new file mode 100644 index 0000000000000..8898bd4dfb654 --- /dev/null +++ b/paddle/fluid/platform/dynload/musa_driver.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/musa_driver.h" + +#include "paddle/phi/backends/dynload/musa_driver.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSA_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUDADriver() { return phi::dynload::HasCUDADriver(); } + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musa_driver.h b/paddle/fluid/platform/dynload/musa_driver.h new file mode 100644 index 0000000000000..261841e8e7384 --- /dev/null +++ b/paddle/fluid/platform/dynload/musa_driver.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/musa_driver.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern bool HasCUDADriver(); + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +/** + * include all needed musa driver functions + **/ +#define PLATFORM_MUSA_ROUTINE_EACH(__macro) \ + __macro(muInit); \ + __macro(muDriverGetVersion); \ + __macro(muGetErrorString); \ + __macro(muModuleLoadData); \ + __macro(muModuleGetFunction); \ + __macro(muModuleUnload); \ + __macro(muOccupancyMaxActiveBlocksPerMultiprocessor); \ + __macro(muLaunchKernel); \ + __macro(muCtxCreate); \ + __macro(muCtxGetCurrent); \ + __macro(muDeviceGetCount); \ + __macro(muDevicePrimaryCtxGetState); \ + __macro(muDeviceGetAttribute); \ + __macro(muDeviceGet) + +PLATFORM_MUSA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP); + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musartc.cc b/paddle/fluid/platform/dynload/musartc.cc new file mode 100644 index 0000000000000..4e15dab9c1359 --- /dev/null +++ b/paddle/fluid/platform/dynload/musartc.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/musartc.h" + +#include "paddle/phi/backends/dynload/musartc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSARTC_ROUTINE_EACH(DEFINE_WRAP); + +bool HasNVRTC() { return phi::dynload::HasNVRTC(); } + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musartc.h b/paddle/fluid/platform/dynload/musartc.h new file mode 100644 index 0000000000000..c383c85d7ab04 --- /dev/null +++ b/paddle/fluid/platform/dynload/musartc.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/musartc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern bool HasNVRTC(); + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +/** + * include all needed musartc functions + **/ +#define MUSARTC_ROUTINE_EACH(__macro) \ + __macro(mtrtcVersion); \ + __macro(mtrtcGetErrorString); \ + __macro(mtrtcCompileProgram); \ + __macro(mtrtcCreateProgram); \ + __macro(mtrtcDestroyProgram); \ + __macro(mtrtcGetMUSA); \ + __macro(mtrtcGetMUSASize); \ + __macro(mtrtcGetProgramLog); \ + __macro(mtrtcGetProgramLogSize) + +MUSARTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musparse.cc b/paddle/fluid/platform/dynload/musparse.cc new file mode 100644 index 0000000000000..b0e8dbb58d569 --- /dev/null +++ b/paddle/fluid/platform/dynload/musparse.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/musparse.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +#ifdef MUSPARSE_ROUTINE_EACH +MUSPARSE_ROUTINE_EACH(DEFINE_WRAP); +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musparse.h b/paddle/fluid/platform/dynload/musparse.h new file mode 100644 index 0000000000000..758c39104433e --- /dev/null +++ b/paddle/fluid/platform/dynload/musparse.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/musparse.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#if defined(PADDLE_WITH_MUSA) +#define MUSPARSE_ROUTINE_EACH(__macro) \ + __macro(musparseSetStream); \ + __macro(musparseCreateMatDescr); \ + __macro(musparseSnnz); \ + __macro(musparseDnnz); \ + __macro(musparseSetMatType); \ + __macro(musparseSetMatIndexBase); \ + __macro(musparseCreateCsr); \ + __macro(musparseCreateCoo); \ + __macro(musparseCreateDnMat); \ + __macro(musparseCreateDnVec); \ + __macro(musparseSpMM); \ + __macro(musparseDestroySpMat); \ + __macro(musparseDestroyDnMat); \ + __macro(musparseDestroyDnVec); \ + __macro(musparseSpMV); \ + __macro(musparseSDDMM_bufferSize); \ + __macro(musparseSDDMM_preprocess); \ + __macro(musparseSDDMM); \ + __macro(musparseDnMatSetStridedBatch); \ + __macro(musparseCooSetStridedBatch); \ + __macro(musparseCsrSetStridedBatch); + +MUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP) +#endif // PADDLE_WITH_MUSA + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 425d4939b565f..ff33ea379d20c 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -38,6 +38,16 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#include +#include +#include +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include @@ -98,7 +108,8 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/core/enforce.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/phi/core/flags.h" diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 9fc200ca82f1c..a5fc3786323c6 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -345,7 +345,8 @@ TEST(EOF_EXCEPTION, THROW_EOF) { EXPECT_TRUE(caught_eof); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { PADDLE_ENFORCE_GPU_SUCCESS(value); @@ -395,6 +396,54 @@ TEST(enforce, hip_success) { EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Rccl error")); #endif } +#elif defined(PADDLE_WITH_MUSA) +TEST(enforce, musa_success) { + EXPECT_TRUE(CheckCudaStatusSuccess(musaSuccess)); + EXPECT_TRUE(CheckCudaStatusFailure(musaErrorInvalidValue, "MUSA error")); + + EXPECT_TRUE(CheckCudaStatusFailure(musaErrorMemoryAllocation, "MUSA error")); + + EXPECT_TRUE(CheckCudaStatusFailure( + musaErrorInsufficientDriver, + "This indicates that the installed MooreThreads MUSA driver is older " + "than the " + "MUSA runtime library. 
This is not a supported configuration.Users " + "should install an updated MooreThreads display driver to allow the " + "application to run")); + EXPECT_TRUE(CheckCudaStatusFailure( + musaErrorContextIsDestroyed, + "This error indicates that the context current to the calling thread has " + "been destroyed using muCtxDestroy, or is a primary context which has " + "not yet been initialized")); + + EXPECT_TRUE(CheckCudaStatusSuccess(MURAND_STATUS_SUCCESS)); + EXPECT_TRUE( + CheckCudaStatusFailure(MURAND_STATUS_VERSION_MISMATCH, "MURAND error")); + EXPECT_TRUE( + CheckCudaStatusFailure(MURAND_STATUS_NOT_CREATED, "MURAND error")); + EXPECT_TRUE( + CheckCudaStatusFailure(MURAND_STATUS_LENGTH_NOT_MULTIPLE, + "Length requested is not a multple of dimension")); + + EXPECT_TRUE(CheckCudaStatusSuccess(MUBLAS_STATUS_SUCCESS)); + EXPECT_TRUE( + CheckCudaStatusFailure(MUBLAS_STATUS_NOT_IMPLEMENTED, "MUBLAS error")); + EXPECT_TRUE( + CheckCudaStatusFailure(MUBLAS_STATUS_INVALID_VALUE, "MUBLAS error")); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) + EXPECT_TRUE(CheckCudaStatusSuccess(mcclSuccess)); + EXPECT_TRUE(CheckCudaStatusFailure(mcclUnhandledMusaError, "MCCL error")); + EXPECT_TRUE(CheckCudaStatusFailure(mcclSystemError, "MCCL error")); + EXPECT_TRUE(CheckCudaStatusFailure(mcclInternalError, + "An internal check failed. This is either " + "a bug in MCCL or due to memory " + "corruption")); + EXPECT_TRUE(CheckCudaStatusFailure(mcclInvalidUsage, + "The call to MCCL is incorrect. This is " + "usually reflecting a programming error")); +#endif +} #else TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index e807a54fdee2d..e1a40cb8f7f64 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -21,6 +21,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index b5f31fd85847c..bce0890daecf9 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -18,7 +18,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -172,7 +173,8 @@ void InitDevices() { #endif /*Init all available devices by default */ std::vector devices; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) try { // use user specified GPUs in single-node multi-process mode. 
devices = platform::GetSelectedDevices(); @@ -215,7 +217,8 @@ void InitDevices(const std::vector devices) { continue; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPlace(devices[i])); #endif #ifdef PADDLE_WITH_XPU @@ -226,7 +229,8 @@ void InitDevices(const std::vector devices) { #endif } places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) places.emplace_back(platform::CUDAPinnedPlace()); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -441,14 +445,15 @@ void InitMemoryMethod() { memory_method->allocation_deleter = paddle::memory::allocation::Allocator::AllocationDeleter; #if defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_CUDA) || \ - defined(PADDLE_WITH_HIP) + defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) memory_method->copy_with_stream = paddle::memory::Copy; #endif memory_method->copy = paddle::memory::Copy; memory_method->device_memory_stat_current_value = paddle::memory::DeviceMemoryStatCurrentValue; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage; #endif memory_method->emplace_device_contexts = diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 66fb431af29e9..3cb6ea34bdaff 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -32,7 +32,8 @@ TEST(InitDevices, CUDA) { using paddle::framework::InitDevices; using paddle::platform::DeviceContextPool; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int count = paddle::platform::GetGPUDeviceCount(); InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 959379260419d..d2c1a448b633c 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -57,7 +57,8 @@ typename Visitor::result_type VisitPlace(const Place &place, const Visitor &visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::CUDAPlace p(place.GetDeviceId()); return visitor(p); #else @@ -67,7 +68,8 @@ typename Visitor::result_type VisitPlace(const Place &place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) platform::CUDAPinnedPlace p; return visitor(p); #else diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 2c65023988dc6..979219fb1920b 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -698,7 +698,8 @@ void EnableProfiler(ProfilerState state) { HostTraceLevel::GetInstance().SetLevel(option.trace_level); should_send_profile_state = true; phi::GetDeviceTracer()->Enable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if 
(phi::ProfilerHelper::g_state == ProfilerState::kCUDA || phi::ProfilerHelper::g_state == ProfilerState::kAll || phi::ProfilerHelper::g_state == ProfilerState::kCPU) { diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index 5d1caffd45326..d2fea0336f012 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -16,6 +16,10 @@ limitations under the License. */ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif @@ -52,6 +56,20 @@ void DummyKernelAndEvent() { PADDLE_ENFORCE_GPU_SUCCESS(hipFree(ptr)); }); } +#elif defined(PADDLE_WITH_MUSA) + for (int i = 0; i < 5; i++) { + ForEachDevice([](int d) { + platform::SetDeviceId(d); + musaStream_t stream; + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream)); + Mark("_cuda_startup_"); + int *ptr; + PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&ptr, sizeof(int))); + DummyKernel<<<1, 1, 0, stream>>>(ptr); + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(ptr)); + }); + } #else for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index c71b5a0e49104..607961ceebda3 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -31,7 +31,8 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -197,7 +198,8 @@ std::string OpName(const framework::VariableNameMap& name_map, const std::string& type_name); void SetTracerOption(TracerOption option); platform::TracerOption GetTracerOption(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void DummyKernelAndEvent(); #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index e3fe83c5a74d2..4bd2be19c15bd 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -561,7 +561,8 @@ void ChromeTracingLogger::LogMetaInfo(const std::string& version, span_indx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void ChromeTracingLogger::LogDeviceProperty( const std::map& device_property_map) { // add device property information diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 7f9bec1c32a53..6ad4883b89944 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -40,7 +40,8 @@ class ChromeTracingLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void LogDeviceProperty( const std::map& device_property_map); #endif 
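For reference, a minimal sketch of the four-way type-alias pattern that the gpu_types.h hunk above applies: under PADDLE_WITH_MUSA the gpu* aliases resolve to the musa* runtime types, so code written against gpuStream_t / gpuError_t compiles unchanged on Moore Threads hardware, and the surrounding hunks only need the extra `defined(PADDLE_WITH_MUSA)` arm in their guards. The vendor handle types below are stand-ins assumed here only to keep the snippet self-contained; the real header takes them from the CUDA, ROCm, or MUSA runtime headers.

// Illustrative stand-ins for the vendor stream handles; in the real build
// these come from the CUDA, HIP, or MUSA runtime headers respectively.
struct CUstream_st;
struct ihipStream_t;
struct MUstream_st;
using cudaStream_t = CUstream_st *;
using hipStream_t = ihipStream_t *;
using musaStream_t = MUstream_st *;

// Same selection logic as the patched DECLARE_TYPE_FOR_GPU macro:
// one alias per platform, chosen by the PADDLE_WITH_* build flag.
#if defined(PADDLE_WITH_HIP)
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \
  using GPU_TYPE = ROCM_TYPE;
#elif defined(PADDLE_WITH_MUSA)
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \
  using GPU_TYPE = MUSA_TYPE;
#else  // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \
  using GPU_TYPE = CUDA_TYPE;
#endif

// With -DPADDLE_WITH_MUSA this expands to `using gpuStream_t = musaStream_t;`,
// which is why call sites guarded by PADDLE_WITH_MUSA can keep using the
// generic gpuStream_t spelling rather than introducing new code paths.
DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t);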
diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index 1d0970235a128..f73423d84a69b 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -129,7 +129,8 @@ std::unique_ptr DeserializationReader::Parse() { // restore NodeTrees object std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); // restore gpuDeviceProp -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::map device_property_map; for (auto indx = 0; indx < node_trees_proto_->device_property_size(); indx++) { @@ -155,7 +156,8 @@ DeserializationReader::~DeserializationReader() { input_file_stream_.close(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuDeviceProp DeserializationReader::RestoreDeviceProperty( const DevicePropertyProto& device_property_proto) { gpuDeviceProp device_property; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index 5f99f6fd82c55..8f3f1766e126b 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -39,7 +39,8 @@ class DeserializationReader { MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( const OperatorSupplementEventNodeProto&); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&); #endif diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index be1e1c01f8b52..9fce9e3eeecf8 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -40,7 +40,8 @@ void SerializationLogger::OpenFile() { node_trees_proto_ = new NodeTreesProto(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void SerializationLogger::LogDeviceProperty( const std::map& device_property_map) { for (auto it = device_property_map.begin(); it != device_property_map.end(); diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 80d5413106ded..6ff84150436c7 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -37,7 +37,8 @@ class SerializationLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index eaea4f3850fef..14d81876233fd 100644 --- 
a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -137,7 +137,8 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { return host_python_node; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) ProfilerResult::ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -179,7 +180,8 @@ void ProfilerResult::Save(const std::string& file_name, if (format == std::string("json")) { ChromeTracingLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); @@ -187,7 +189,8 @@ void ProfilerResult::Save(const std::string& file_name, } else if (format == std::string("pb")) { SerializationLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index dae32a1902834..964fcc4c19050 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -138,7 +138,8 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) explicit ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -166,7 +167,8 @@ class ProfilerResult { std::string GetVersion() { return version_; } uint32_t GetSpanIndx() { return span_indx_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::map GetDeviceProperty() { return device_property_map_; } @@ -176,7 +178,8 @@ class ProfilerResult { std::map thread_event_trees_map_; std::shared_ptr tree_; ExtraInfo extra_info_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::map device_property_map_; #endif std::string version_; diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index e0a91629a10d6..76a1b347a363f 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -18,10 +18,14 @@ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/enforce.h" @@ -43,6 +47,9 @@ void SynchronizeDevice() { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); +#endif #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -161,7 +168,8 @@ std::unique_ptr 
Profiler::Stop() { std::string("%s"), kv.second.c_str()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::map device_property_map; std::vector device_ids = GetSelectedDevices(); for (auto index = 0u; index < device_ids.size(); index++) { diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index 7fb25b25577c4..a4fb29b86f43f 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -93,6 +93,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, return occupancy; } +#elif defined(PADDLE_WITH_MUSA) + #else float CalculateEstOccupancy(uint32_t DeviceId, diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index c9437e0e7793a..5adaadf87d288 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -133,6 +133,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, int32_t BlockZ, void* kernelFunc, uint8_t launchType); +#elif defined(PADDLE_WITH_MUSA) + #else float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 1d34d5fd27b3e..2e00826744091 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -31,6 +31,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#endif // PADDLE_WITH_MUSA #ifdef PADDLE_WITH_HIP #include #endif @@ -103,6 +106,15 @@ void SynchronizeAllDevice() { } SetDeviceId(pre_device_id); #endif +#ifdef PADDLE_WITH_MUSA + int pre_device_id = GetCurrentDeviceId(); + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); + } + SetDeviceId(pre_device_id); +#endif #ifdef PADDLE_WITH_HIP int pre_device_id = GetCurrentDeviceId(); int count = GetGPUDeviceCount(); @@ -142,7 +154,8 @@ void PrintMemProfiler( << " Memory Profiling Report " << "<-------------------------\n\n"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int num_gpus = GetGPUDeviceCount(); std::cout.setf(std::ios::left); if (num_gpus > 0) { @@ -344,7 +357,8 @@ void SetEvent(bool merge_thread, if (rit != pushed_events->rend()) { double event_time = 0; double gpu_time = 0.0f; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpu_time = rit->CudaElapsedMs(analyze_event); #endif double cpu_time = rit->CpuElapsedMs(analyze_event); diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 0e1c681288fe1..af59782d5c926 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -122,7 +122,8 @@ TEST(RecordEvent, RecordEvent) { if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; if (events[i][j].name() == "push") { EXPECT_EQ(events[i][j + 1].name(), "pop"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); #else EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); diff 
--git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index c55bcb71a7d43..69a1d2c575421 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -24,11 +24,14 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void *user_data) #endif +#ifdef PADDLE_WITH_MUSA + static void MUSART_CB StreamCallbackFunc(void *user_data) +#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data) + static void CUDART_CB StreamCallbackFunc(void *user_data) #else - static void CUDART_CB + static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data) #endif #endif @@ -58,6 +61,10 @@ void StreamCallbackManager::AddCallback( PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS( + musaLaunchHostFunc(stream_, StreamCallbackFunc, func)); +#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 PADDLE_ENFORCE_GPU_SUCCESS( @@ -71,7 +78,8 @@ void StreamCallbackManager::AddCallback( template void StreamCallbackManager::Wait() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) || \ + defined(PADDLE_WITH_MUSA) platform::GpuStreamSync(stream_); #endif { @@ -85,6 +93,9 @@ void StreamCallbackManager::Wait() const { #ifdef PADDLE_WITH_CUDA template struct StreamCallbackManager; #endif +#ifdef PADDLE_WITH_MUSA +template struct StreamCallbackManager; +#endif #ifdef PADDLE_WITH_HIP template struct StreamCallbackManager; #endif diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 7cd6930a9d0d0..10b0a1aded0d9 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -21,6 +21,11 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 2b8969e1b8181..b320f96839e4c 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -24,7 +24,8 @@ namespace py = pybind11; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::CUDAStream *get_current_stream(int device_id) { if (device_id == -1) { device_id = phi::backends::gpu::GetCurrentDeviceId(); @@ -51,7 +52,8 @@ void BindCudaStream(py::module *m_ptr) { m.def( "_get_current_stream", [](int deviceId) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return platform::get_current_stream(deviceId); #else PADDLE_THROW( @@ -64,7 +66,8 @@ void BindCudaStream(py::module *m_ptr) { m.def( "_set_current_stream", [](phi::CUDAStream *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return platform::set_current_stream(stream); #else PADDLE_THROW( @@ -75,7 +78,8 @@ void BindCudaStream(py::module *m_ptr) { py::return_value_policy::reference); m.def("_device_synchronize", [](int device_id) { -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (device_id == -1) { device_id = paddle::platform::GetCurrentDeviceId(); } @@ -84,6 +88,8 @@ void BindCudaStream(py::module *m_ptr) { paddle::platform::SetDeviceId(device_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif @@ -115,7 +121,8 @@ void BindCudaStream(py::module *m_ptr) { s3 = paddle.device.cuda.Stream() )DOC") -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def( "wait_event", [](phi::CUDAStream &self, paddle::platform::CudaEvent &event) { @@ -251,7 +258,8 @@ void BindCudaStream(py::module *m_ptr) { .def( "__init__", [](phi::CUDAStream &self, platform::CUDAPlace *place, int priority) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (priority != 1 && priority != 2) { PADDLE_THROW(platform::errors::InvalidArgument( "Priority should be 1(high) or 2(normal) ")); @@ -277,7 +285,8 @@ void BindCudaStream(py::module *m_ptr) { .def( "__init__", [](phi::CUDAStream &self, int device, int priority) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (priority != 1 && priority != 2) { PADDLE_THROW(platform::errors::InvalidArgument( "Priority should be 1(high) or 2(normal) ")); @@ -307,7 +316,8 @@ void BindCudaStream(py::module *m_ptr) { py::arg("device") = -1, py::arg("priority") = 2) .def("__init__", [](phi::CUDAStream &self) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int device_id = platform::GetCurrentDeviceId(); auto stream_flag = phi::CUDAStream::StreamFlag::kStreamNonBlocking; new (&self) phi::CUDAStream( @@ -334,7 +344,8 @@ void BindCudaStream(py::module *m_ptr) { event = paddle.device.cuda.Event() )DOC") -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def( "record", [](paddle::platform::CudaEvent &self, phi::CUDAStream *stream) { @@ -398,7 +409,8 @@ void BindCudaStream(py::module *m_ptr) { bool enable_timing, bool blocking, bool interprocess) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) unsigned int flags = platform::GenerateDeviceEventFlag( enable_timing, blocking, interprocess); new (&self) paddle::platform::CudaEvent(flags); diff --git a/paddle/fluid/pybind/cuda_streams_py.h b/paddle/fluid/pybind/cuda_streams_py.h index d10608a6e8ea9..41e62fd92aefb 100644 --- a/paddle/fluid/pybind/cuda_streams_py.h +++ b/paddle/fluid/pybind/cuda_streams_py.h @@ -17,7 +17,8 @@ #include "pybind11/pybind11.h" #include "pybind11/stl.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #else namespace phi { @@ -29,7 +30,8 @@ namespace py = pybind11; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::CUDAStream* get_current_stream(int device_id = -1); phi::CUDAStream* set_current_stream(phi::CUDAStream* stream); #endif diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 59a94a31c448d..d560e11da0674 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -58,7 +58,8 @@ typedef SSIZE_T ssize_t; #include "pybind11/numpy.h" #include "pybind11/pybind11.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/pybind/cuda_streams_py.h" #endif diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 69d0465bf7cdd..72a1df8e0ace9 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -138,7 +138,8 @@ std::set _complex_dtypes{ void SetDevice(paddle::platform::Place place) { if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::backends::gpu::SetDeviceId(place.device); VLOG(6) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << static_cast(place.device); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index eb0e895cf575c..a911d593f76c1 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -223,13 +223,16 @@ static PyObject* tensor_method_numpy(TensorObject* self, sizeof_dtype * numel); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (self->tensor.is_gpu()) { eager_gil_scoped_release guard; #if defined(PADDLE_WITH_CUDA) gpuMemcpyKind kind = cudaMemcpyDeviceToHost; #elif defined(PADDLE_WITH_HIP) gpuMemcpyKind kind = hipMemcpyDeviceToHost; +#elif defined(PADDLE_WITH_MUSA) + gpuMemcpyKind kind = musaMemcpyDeviceToHost; #endif if (self->tensor.is_selected_rows()) { VLOG(6) << "Getting SelectedRows's numpy value"; @@ -1338,7 +1341,8 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, self_numpy[_index] = py::object(py::handle(value_obj), true); } if (!self->tensor.initialized()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) SetTensorFromPyArray(self_tensor, self_numpy, platform::Place(platform::CUDAPlace(0)), diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 99621b1463ea9..598272ee09aff 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -40,7 +40,8 @@ void BindGenerator(py::module* m_ptr) { [](std::shared_ptr& self) { return self->current_seed; }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // NOTE(shenliang03): Due to the inability to serialize mt19937_64 // type, resulting in a problem with precision under the cpu. 
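// Sketch (illustration only, not part of the patch): the pybind hunks above
// follow the same convention as the rest of this change -- every
// `defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)` guard gains a
// PADDLE_WITH_MUSA branch, and each CUDA runtime call gets a musa* twin.
// A minimal sketch of that three-way dispatch, using only calls that already
// appear in this diff; the helper name is hypothetical.
inline void SynchronizeCurrentDevice() {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#elif defined(PADDLE_WITH_MUSA)
  PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize());
#elif defined(PADDLE_WITH_CUDA)
  PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#endif
}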
.def(py::pickle( diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index bdf54bd76b6e1..f27a7adc62a07 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -43,7 +43,8 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/compat/convert_utils.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -658,7 +659,8 @@ void BindPaddlePredictor(py::module *m) { .def("get_output_names", &PaddlePredictor::GetOutputNames) .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun) .def("clone", [](PaddlePredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("clone", [](PaddlePredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -705,7 +707,8 @@ void BindNativePredictor(py::module *m) { .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) .def("clone", [](NativePaddlePredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("clone", [](NativePaddlePredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -750,7 +753,8 @@ void BindAnalysisConfig(py::module *m) { .def("exp_enable_use_cutlass", &AnalysisConfig::Exp_EnableUseCutlass) .def("exp_disable_mixed_precision_ops", &AnalysisConfig::Exp_DisableMixedPrecisionOps) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("set_exec_stream", [](AnalysisConfig &self, phi::CUDAStream &stream) { self.SetExecStream(stream.raw_stream()); @@ -1084,7 +1088,8 @@ void BindAnalysisPredictor(py::module *m) { &AnalysisPredictor::analysis_argument, py::return_value_policy::reference) .def("clone", [](AnalysisPredictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("clone", [](AnalysisPredictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); @@ -1122,7 +1127,8 @@ void BindPaddleInferPredictor(py::module *m) { .def("run", [](paddle_infer::Predictor &self) { self.Run(); }) .def("clone", [](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("clone", [](paddle_infer::Predictor &self, phi::CUDAStream &stream) { return self.Clone(stream.raw_stream()); diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 9ba115381a2c0..aee4dd8b07a04 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -126,11 +126,12 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index c97bba9be8f2f..6c76c61542528 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -126,11 +126,12 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -318,7 +319,8 @@ void BindPlace(pybind11::module &m) { // NOLINT cudaplace .def("__init__", [](platform::CUDAPlace &self, int dev_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (UNLIKELY(dev_id < 0)) { LOG(ERROR) << string::Sprintf( "Invalid CUDAPlace(%d), device id must be 0 or " @@ -357,7 +359,8 @@ void BindPlace(pybind11::module &m) { // NOLINT std::exit(-1); #endif }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("get_device_id", [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) .def("_type", &PlaceIndex) @@ -372,10 +375,11 @@ void BindPlace(pybind11::module &m) { // NOLINT #endif .def("__repr__", string::to_string) .def("__str__", string::to_string); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return true; #else return platform::GetGPUComputeCapability(place.device) >= 53; @@ -383,7 +387,7 @@ void BindPlace(pybind11::module &m) { // NOLINT }); m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 80 support bfloat16 -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) return false; #else return platform::GetGPUComputeCapability(place.device) >= 80; @@ -540,7 +544,8 @@ void BindPlace(pybind11::module &m) { // NOLINT cudapinnedplace .def("__init__", [](platform::CUDAPinnedPlace &self) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU 
only version, " "Please recompile or reinstall Paddle with CUDA support.")); diff --git a/paddle/fluid/pybind/process_group_utils.h b/paddle/fluid/pybind/process_group_utils.h index 1a6b640b3a3cf..85fde515754a5 100644 --- a/paddle/fluid/pybind/process_group_utils.h +++ b/paddle/fluid/pybind/process_group_utils.h @@ -250,7 +250,8 @@ void ConcatTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) ConcatDenseTensorWithType(static_cast(dev_ctx), tensor_list, dense_tensor, @@ -307,7 +308,8 @@ void SplitTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) SplitDenseTensorWithType(static_cast(dev_ctx), tensor, &dense_list, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 60ade1f9875fd..f63330c76a5fe 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -144,11 +144,12 @@ limitations under the License. */ #include "paddle/fluid/pybind/tensor.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -231,7 +232,8 @@ bool IsCompiledWithAVX() { } bool IsCompiledWithCUDA() { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) return false; #else return true; @@ -776,7 +778,8 @@ PYBIND11_MODULE(libpaddle, m) { } }); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) m.def("cudnn_version", &platform::DnnVersion); m.def("gpu_memory_available", []() { size_t available = 0; @@ -828,7 +831,8 @@ PYBIND11_MODULE(libpaddle, m) { if (dl.device.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (dl.device.device_type == kDLGPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } @@ -1563,7 +1567,8 @@ All parameter, weight, gradient are variables in Paddle. "create", [](paddle::platform::CUDAPlace &place) -> paddle::platform::DeviceContext * { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -1597,7 +1602,8 @@ All parameter, weight, gradient are variables in Paddle. 
"create", [](paddle::platform::CUDAPinnedPlace &place) -> paddle::platform::DeviceContext * { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -2199,7 +2205,8 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) m.def("get_cuda_device_count", platform::GetGPUDeviceCount); m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId); m.def("cuda_empty_cache", [] { @@ -2245,7 +2252,7 @@ All parameter, weight, gradient are variables in Paddle. return ostr.str(); }); -#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) +#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) && !defined(PADDLE_WITH_MUSA) m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); @@ -2320,7 +2327,8 @@ All parameter, weight, gradient are variables in Paddle. .def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo) .def("get_version", &paddle::platform::ProfilerResult::GetVersion) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx) .def("get_device_property", &paddle::platform::ProfilerResult::GetDeviceProperty); @@ -2477,7 +2485,8 @@ All parameter, weight, gradient are variables in Paddle. m.def("enable_op_info_recorder", &phi::EnableOpInfoRecorder); m.def("disable_op_info_recorder", &phi::DisableOpInfoRecorder); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) m.def("set_cublas_switch", phi::SetAllowTF32Cublas); m.def("get_cublas_switch", phi::AllowTF32Cublas); m.def("set_cudnn_switch", phi::SetAllowTF32Cudnn); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 98ae45dd0134b..cee763c6530f1 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -126,11 +126,12 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index ba33fcd1d129f..c0eaa9dc3d524 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -37,7 +37,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/pybind/complex.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" @@ -325,7 +326,8 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { #endif } else if (platform::is_gpu_place(self.place()) || platform::is_cuda_pinned_place(self.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) const T *a = self.data(); auto p = self.place(); paddle::memory::Copy( @@ -362,7 +364,8 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { #endif } else if (platform::is_gpu_place(self->place()) || platform::is_cuda_pinned_place(self->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy( @@ -457,7 +460,8 @@ void SetTensorFromPyArrayT( "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (paddle::platform::is_gpu_place(place)) { // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. @@ -466,6 +470,9 @@ void SetTensorFromPyArrayT( #ifdef PADDLE_WITH_HIP paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), hipMemcpyHostToDevice); +#elif defined(PADDLE_WITH_MUSA) + paddle::platform::GpuMemcpySync( + dst, array.data(), array.nbytes(), musaMemcpyHostToDevice); #else paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), cudaMemcpyHostToDevice); @@ -790,7 +797,8 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self, output->mutable_data(place, self.dtype()); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (platform::is_cuda_pinned_place(place)) { output->mutable_data(place, self.dtype()); } else if ((platform::is_gpu_place(place))) { @@ -1039,7 +1047,8 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (is_gpu_tensor) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); PADDLE_ENFORCE_EQ(py_arr.writeable(), true, diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 1ed3fac122826..593109d3e8e27 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -123,6 +123,9 @@ if(WITH_GPU) elseif(WITH_ROCM) hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) target_link_libraries(phi ${PHI_DEPS}) +elseif(WITH_MUSA) + musa_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) + target_link_libraries(phi ${PHI_DEPS}) elseif(WITH_XPU_KP) xpu_library( phi ${PHI_BUILD_TYPE} diff --git a/paddle/phi/api/include/context_pool.h 
b/paddle/phi/api/include/context_pool.h index 7afe17ba8419d..b694dc8013a30 100644 --- a/paddle/phi/api/include/context_pool.h +++ b/paddle/phi/api/include/context_pool.h @@ -97,7 +97,8 @@ namespace paddle { */ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * Get the current CUDA stream for the passed CUDA device. */ diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index b626df6c6701c..30f087d22c559 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -29,6 +29,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_MUSA +#include +using gpuStream_t = musaStream_t; +#endif + #include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" @@ -396,7 +401,8 @@ class PADDLE_API Tensor final { */ void set_impl(std::shared_ptr&& impl); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * @brief Get the stream where the tensor is currently located * This is a deprecated method and may be removed in the future! diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc index 292bd8a7e47aa..f958ea2a96039 100644 --- a/paddle/phi/api/lib/context_pool.cc +++ b/paddle/phi/api/lib/context_pool.cc @@ -19,7 +19,8 @@ limitations under the License. */ #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -63,7 +64,8 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) { return const_cast(&dev_ctx->GetAllocator()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) { PADDLE_ENFORCE_EQ(place.GetType(), phi::AllocationType::GPU, diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index f9316965be26b..12c13cba89fb0 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -93,7 +93,8 @@ phi::DenseTensor CastDataType(const Context& dev_ctx, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx, const phi::DenseTensor& tensor, DataType dtype) { @@ -135,7 +136,8 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, if (tensor.place().GetType() == phi::AllocationType::CPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (tensor.place().GetType() == phi::AllocationType::GPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); @@ -153,7 +155,8 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, << " dst_place: " 
<< dst_place; auto& pool = phi::DeviceContextPool::Instance(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // NOTE(yy): TransDataPlace should wait for computation of input. if (tensor.place().GetType() != phi::AllocationType::GPUPINNED) { pool.Get(tensor.place())->Wait(); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index e8caf52530868..a11dbf445ab9b 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -359,7 +359,8 @@ void Tensor::set_impl(std::shared_ptr &&impl) { impl_ = std::move(impl); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) gpuStream_t Tensor::stream() const { int device_id = phi::backends::gpu::GetCurrentDeviceId(); auto *gpu_context = DeviceContextPool::Instance().Get( diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index b8d25e4f22b10..49e2a2698f4dd 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -17,9 +17,12 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/core/dense_tensor.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif @@ -30,7 +33,8 @@ namespace paddle { PD_REGISTER_API(from_blob) phi::Place GetPlaceFromPtr(void* data) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 cudaPointerAttributes attr; @@ -43,6 +47,12 @@ phi::Place GetPlaceFromPtr(void* data) { phi::errors::Unimplemented("The GetPlaceFromPtr() method is only " "supported when CUDA version >= 10.0.")); #endif +#elif defined(PADDLE_WITH_MUSA) + musaPointerAttributes attr; + musaError_t status = musaPointerGetAttributes(&attr, data); + if (status == musaSuccess && attr.type == musaMemoryTypeDevice) { + return phi::GPUPlace(attr.device); + } #else hipPointerAttribute_t attr; hipError_t status = hipPointerGetAttributes(&attr, data); diff --git a/paddle/phi/api/profiler/event.h b/paddle/phi/api/profiler/event.h index b19f20485227b..eaf8afbe03a65 100644 --- a/paddle/phi/api/profiler/event.h +++ b/paddle/phi/api/profiler/event.h @@ -27,8 +27,12 @@ limitations under the License. 
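// Sketch of the MUSA branch added to GetPlaceFromPtr() above, pulled out into
// a standalone function for clarity. The function name is hypothetical, and it
// assumes musaPointerGetAttributes classifies device memory the same way the
// CUDA/HIP attribute queries in the neighbouring branches do.
phi::Place PlaceOfPtr(void* data) {
  musaPointerAttributes attr;
  musaError_t status = musaPointerGetAttributes(&attr, data);
  if (status == musaSuccess && attr.type == musaMemoryTypeDevice) {
    return phi::GPUPlace(attr.device);  // pointer lives on a MUSA device
  }
  return phi::CPUPlace();  // otherwise treat it as host memory
}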
*/ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/core/cuda_stream.h" #endif @@ -62,7 +66,8 @@ class Event { void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } std::string attr() const { return attr_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifndef PADDLE_WITH_CUPTI gpuEvent_t event() const { return event_; } int device() const { return device_; } @@ -81,7 +86,8 @@ class Event { int64_t cpu_ns_; bool visited_status_{false}; std::string attr_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; @@ -137,12 +143,15 @@ class MemEvent { }; class CudaEvent { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) public: CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -152,6 +161,8 @@ class CudaEvent { explicit CudaEvent(unsigned int flags) : flags_(flags) { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -161,6 +172,8 @@ class CudaEvent { ~CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventDestroy(event_); +#elif defined(PADDLE_WITH_MUSA) + musaEventDestroy(event_); #else cudaEventDestroy(event_); #endif @@ -169,6 +182,8 @@ class CudaEvent { void Record(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream)); #endif @@ -183,6 +198,14 @@ class CudaEvent { if (err == hipErrorNotReady) { return false; } +#elif defined(PADDLE_WITH_MUSA) + gpuError_t err = musaEventQuery(event_); + if (err == musaSuccess) { + return true; + } + if (err == musaErrorNotReady) { + return false; + } #else gpuError_t err = cudaEventQuery(event_); if (err == cudaSuccess) { @@ -199,6 +222,8 @@ class CudaEvent { void Synchronize() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventSynchronize(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); #endif @@ -208,6 +233,8 @@ class CudaEvent { private: #ifdef PADDLE_WITH_HIP unsigned int flags_ = hipEventDefault; +#elif defined(PADDLE_WITH_MUSA) + unsigned int flags_ = musaEventDefault; #else unsigned int flags_ = cudaEventDefault; #endif diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 1c916682cf7b1..5e14c15e8cb26 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -7,7 +7,9 @@ if(NOT APPLE AND NOT WIN32) list(APPEND BACKENDS_SRCS device_code.cc) endif() -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) list(APPEND BACKENDS_SRCS 
gpu/gpu_context.cc gpu/gpu_info.cc gpu/gpu_resources.cc) if(WITH_GPU) @@ -16,6 +18,9 @@ if(WITH_GPU OR WITH_ROCM) if(WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) endif() + if(WITH_MUSA) + list(APPEND BACKENDS_SRCS gpu/musa/musa_info.cc) + endif() endif() if(WITH_XPU) @@ -43,6 +48,7 @@ list( if(WITH_GPU OR WITH_ROCM + OR WITH_MUSA OR WITH_CUSTOM_DEVICE) list(APPEND BACKENDS_SRCS device_base.cc) endif() diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc index e295ac388d892..e3b28fb2c0871 100644 --- a/paddle/phi/backends/context_pool.cc +++ b/paddle/phi/backends/context_pool.cc @@ -21,7 +21,8 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) bool allow_tf32_cublas = true; void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } bool AllowTF32Cublas() { return allow_tf32_cublas; } diff --git a/paddle/phi/backends/context_pool.h b/paddle/phi/backends/context_pool.h index 6ff90e05fed4a..966f338b7337c 100644 --- a/paddle/phi/backends/context_pool.h +++ b/paddle/phi/backends/context_pool.h @@ -27,7 +27,8 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void SetAllowTF32Cublas(bool active); /*Get the global variable allow_tf32_cublas value*/ bool AllowTF32Cublas(); @@ -46,7 +47,8 @@ struct DefaultDeviceContextType { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template <> struct DefaultDeviceContextType { using TYPE = phi::GPUContext; diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index eb2934d1b4842..ac16a69aa7bee 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -78,7 +78,8 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(phi::errors::PreconditionNotMet( @@ -88,12 +89,14 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) GPUDeviceCode::CheckAvailableStatus(); #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP static bool CheckCUDADriverResult(hipError_t result, std::string caller, @@ -101,6 +104,13 @@ static bool CheckCUDADriverResult(hipError_t result, if (result != hipSuccess) { const char* error = nullptr; error = dynload::hipGetErrorString(result); +#elif defined(PADDLE_WITH_MUSA) +static bool CheckCUDADriverResult(MUresult result, + std::string caller, + std::string kernel_name = "") { + if (result != MUSA_SUCCESS) { + const char* error = nullptr; + dynload::muGetErrorString(result, &error); #else static bool CheckCUDADriverResult(CUresult result, std::string caller, @@ -130,6 +140,8 @@ void 
GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hiprtcResult nvrtc_result = dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); +#elif defined(PADDLE_WITH_MUSA) + mtrtcResult nvrtc_result = dynload::mtrtcVersion(&nvrtc_major, &nvrtc_minor); #else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #endif @@ -140,6 +152,9 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); if (driver_result == hipSuccess) { +#elif defined(PADDLE_WITH_MUSA) + MUresult driver_result = dynload::muDriverGetVersion(&driver_version); + if (driver_result == MUSA_SUCCESS) { #else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); if (driver_result == CUDA_SUCCESS) { @@ -153,6 +168,8 @@ void GPUDeviceCode::CheckAvailableStatus() { << "." << nvrtc_minor; #ifdef PADDLE_WITH_HIP if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { +#elif defined(PADDLE_WITH_MUSA) + if (nvrtc_result != MTRTC_SUCCESS || driver_result != MUSA_SUCCESS) { #else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { #endif @@ -163,6 +180,9 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), "hipGetDeviceCount")) { +#elif defined(PADDLE_WITH_MUSA) + if (CheckCUDADriverResult(dynload::muDeviceGetCount(&count), + "muDeviceGetCount")) { #else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), "cuDeviceGetCount")) { @@ -202,6 +222,8 @@ static std::string FindCUDAIncludePath() { #ifdef PADDLE_WITH_HIP cuda_include_path = "/opt/rocm/include"; +#elif defined(PADDLE_WITH_MUSA) + cuda_include_path = "/usr/local/musa/include"; #else cuda_include_path = "/usr/local/cuda/include"; #endif @@ -229,6 +251,8 @@ GPUDeviceCode::GPUDeviceCode(const Place& place, name_ = name; #ifdef PADDLE_WITH_HIP kernel_ = "#include \n" + kernel; +#elif defined(PADDLE_WITH_MUSA) + kernel_ = kernel; #else kernel_ = kernel; #endif @@ -318,6 +342,86 @@ bool GPUDeviceCode::Compile(bool include_path) { "hipModuleGetFunction")) { return false; } +#elif defined(PADDLE_WITH_MUSA) + mtrtcProgram program; + if (!CheckNVRTCResult(dynload::mtrtcCreateProgram(&program, + kernel_.c_str(), // buffer + name_.c_str(), // name + 0, // numHeaders + nullptr, // headers + nullptr), // includeNames + "mtrtcCreateProgram")) { + return false; + } + + // Compile the program for specified compute_capability + auto* dev_ctx = reinterpret_cast( + DeviceContextPool::Instance().Get(place_)); + int compute_capability = dev_ctx->GetComputeCapability(); + std::string compute_flag = + "--gpu-architecture=compute_" + std::to_string(compute_capability); + std::vector options = {"--std=c++11", compute_flag.c_str()}; + std::string include_option; + if (include_path) { + std::string cuda_include_path = FindCUDAIncludePath(); + if (!cuda_include_path.empty()) { + include_option = "--include-path=" + cuda_include_path; + options.push_back(include_option.c_str()); + } + } + mtrtcResult compile_result = + dynload::mtrtcCompileProgram(program, // program + options.size(), // numOptions + options.data()); // options + if (compile_result == MTRTC_ERROR_COMPILATION) { + // Obtain compilation log from the program + size_t log_size; + if (!CheckNVRTCResult(dynload::mtrtcGetProgramLogSize(program, &log_size), + "mtrtcGetProgramLogSize")) { + return false; + } + std::vector log; + log.resize(log_size + 1); + if 
(!CheckNVRTCResult(dynload::mtrtcGetProgramLog(program, log.data()), + "nvrtcGetProgramLog")) { + return false; + } + LOG(WARNING) << "JIT compiling of MUSA code failed:" + << "\n Kernel name: " << name_ << "\n Kernel body:\n" + << kernel_ << "\n Compiling log: " << log.data(); + + return false; + } + + // Obtain PTX from the program + size_t ptx_size; + if (!CheckNVRTCResult(dynload::mtrtcGetMUSASize(program, &ptx_size), + "mtrtcGetMUSASize")) { + return false; + } + ptx_.resize(ptx_size + 1); + if (!CheckNVRTCResult(dynload::mtrtcGetMUSA(program, ptx_.data()), + "mtrtcGetMUSA")) { + return false; + } + + if (!CheckNVRTCResult(dynload::mtrtcDestroyProgram(&program), + "mtrtcDestroyProgram")) { + return false; + } + + if (!CheckCUDADriverResult(dynload::muModuleLoadData(&module_, ptx_.data()), + "muModuleLoadData", + name_)) { + return false; + } + + if (!CheckCUDADriverResult( + dynload::muModuleGetFunction(&function_, module_, name_.c_str()), + "muModuleGetFunction", + name_)) { + return false; + } #else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, @@ -436,6 +540,22 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { hipSuccess, errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", name_.c_str())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_EQ( + dynload::muLaunchKernel(function_, + num_blocks, + 1, + 1, // grid dim + num_threads_, + 1, + 1, // block dim + 0, // shared memory + dev_ctx->stream(), // stream + args->data(), // arguments + nullptr), + MUSA_SUCCESS, + errors::External("Fail to launch kernel %s (in muLaunchKernel.)", + name_.c_str())); #else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, @@ -464,6 +584,18 @@ bool GPUDeviceCode::CheckNVRTCResult(hiprtcResult result, << " > failed: " << dynload::hiprtcGetErrorString(result); return false; } + return true; +} +#elif defined(PADDLE_WITH_MUSA) +bool GPUDeviceCode::CheckNVRTCResult(mtrtcResult result, std::string function) { + if (result != MTRTC_SUCCESS) { + LOG_FIRST_N(WARNING, 1) + << "Call " << function << " for < " << name_ + << " > failed: " << dynload::mtrtcGetErrorString(result); + return false; + } + return true; +} #else bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { @@ -472,9 +604,9 @@ bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { << " > failed: " << dynload::nvrtcGetErrorString(result); return false; } -#endif return true; } #endif +#endif } // namespace phi diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 8debb4dc9c45e..62aea0c1c6ffb 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -26,6 +26,10 @@ limitations under the License. 
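// Rough usage sketch for the MUSA JIT path added above: mtrtcCreateProgram /
// mtrtcCompileProgram build device code, muModuleLoadData / muModuleGetFunction
// load it, and Launch() dispatches through muLaunchKernel on the context's
// stream. Assumes a WITH_MUSA build; the kernel name, source string and
// argument list are illustrative only.
std::string src = R"(
extern "C" __global__ void scale(float* x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
})";
phi::GPUDeviceCode code(place, "scale", src);  // place: a phi::GPUPlace
if (code.Compile(/*include_path=*/false)) {    // JIT-compile and load the module
  std::vector<void*> args = {&x_dev_ptr, &alpha, &n};
  code.Launch(static_cast<size_t>(n), &args);  // launched on dev_ctx->stream()
}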
*/ #include "paddle/phi/backends/dynload/cuda_driver.h" #include "paddle/phi/backends/dynload/nvrtc.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/musa_driver.h" +#include "paddle/phi/backends/dynload/musartc.h" +#endif #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hiprtc.h" #include "paddle/phi/backends/dynload/rocm_driver.h" @@ -48,7 +52,8 @@ class DeviceCode { std::string kernel_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) class GPUDeviceCode : public DeviceCode { public: explicit GPUDeviceCode(const Place& place, @@ -68,6 +73,8 @@ class GPUDeviceCode : public DeviceCode { private: #ifdef PADDLE_WITH_HIP bool CheckNVRTCResult(hiprtcResult result, std::string function); +#elif defined(PADDLE_WITH_MUSA) + bool CheckNVRTCResult(mtrtcResult result, std::string function); #else bool CheckNVRTCResult(nvrtcResult result, std::string function); #endif @@ -82,6 +89,9 @@ class GPUDeviceCode : public DeviceCode { #ifdef PADDLE_WITH_HIP hipModule_t module_; hipFunction_t function_; +#elif defined(PADDLE_WITH_MUSA) + MUmodule module_; + MUfunction function_; #else CUmodule module_; CUfunction function_; diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index 8508d5206558d..1c47183f0f123 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -36,7 +36,8 @@ inline size_t Alignment(size_t size, if (place.GetType() == phi::AllocationType::CPU) { alignment = phi::backends::cpu::CpuMinChunkSize(); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) alignment = phi::backends::gpu::GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) alignment = phi::backends::xpu::XPUMinChunkSize(); diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 838b623ae7b38..b57c5d096fb2c 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -30,6 +30,17 @@ if(WITH_ROCM) rocsparse.cc) endif() +if(WITH_MUSA) + list( + APPEND + MUSA_SRCS + mudnn.cc + mublas.cc + musparse.cc + murand.cc + mccl.cc) +endif() + # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on macOS, and only do an early test on Linux and Windows. if(NOT APPLE) @@ -46,6 +57,9 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() + if(WITH_MUSA) + list(APPEND MUSA_SRCS musartc.cc musa_driver.cc) + endif() endif() if(TENSORRT_FOUND) @@ -93,6 +107,8 @@ if(WITH_ROCM) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${HIP_SRCS}) elseif(WITH_GPU) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${CUDA_SRCS}) +elseif(WITH_MUSA) + collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${MUSA_SRCS}) else() collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS}) endif() diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 354ff5b7dc855..ac06fb70e57cc 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -95,6 +95,27 @@ PHI_DEFINE_string(rccl_dir, "dlopen will search rccl from LD_LIBRARY_PATH"); #endif +#ifdef PADDLE_WITH_MUSA + +PHI_DEFINE_string(mudnn_dir, + "", + "Specify path for loading libmudnn.so. For instance, " + "/usr/local/musa/lib. 
If empty [default], dlopen " + "will search libmudnn.so from LD_LIBRARY_PATH"); + +PHI_DEFINE_string(musa_dir, + "", + "Specify path for loading musa library, such as libmublas, " + "libmurand, libmusparse. For instance, /usr/local/musa/lib. " + "If default, dlopen will search rocm from LD_LIBRARY_PATH"); + +PHI_DEFINE_string(mccl_dir, + "", + "Specify path for loading mccl library, such as libmccl.so. " + "For instance, /usr/local/musa/lib. If default, " + "dlopen will search mccl from LD_LIBRARY_PATH"); +#endif + #ifdef PADDLE_WITH_XPU DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); #endif @@ -319,6 +340,8 @@ void* GetCublasDsoHandle() { FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmublas.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -360,6 +383,8 @@ void* GetCUDNNDsoHandle() { FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_mudnn_dir, "libmudnn.so", false); #else return GetDsoHandleFromSearchPath( FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path}); @@ -384,6 +409,8 @@ void* GetCurandDsoHandle() { FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmurand.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif @@ -429,6 +456,8 @@ void* GetCusparseDsoHandle() { FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsparse.so"); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusparse.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so"); #endif @@ -439,6 +468,8 @@ void* GetNVRTCDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusart.so", false); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false); #endif @@ -449,6 +480,8 @@ void* GetCUDADsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); +#elif defined(PADDLE_WITH_MUSA) + return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusa.so", false); #elif defined(_WIN32) char system32_dir[MAX_PATH]; GetSystemDirectory(system32_dir, MAX_PATH); @@ -506,6 +539,10 @@ void* GetNCCLDsoHandle() { "You may need to install 'rccl' from ROCM official website: " "https://rocmdocs.amd.com/en/latest/Installation_Guide/" "Installation-Guide.html before install PaddlePaddle."); +#elif defined(PADDLE_WITH_MUSA) + std::string warning_msg( + "You may need to install 'mccl' from MUSA official website" + " before install PaddlePaddle."); #else std::string warning_msg( "You may need to install 'nccl2' from NVIDIA official website: " 
@@ -519,6 +556,9 @@ void* GetNCCLDsoHandle() { #elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) return GetDsoHandleFromSearchPath( FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); +#elif defined(PADDLE_WITH_MUSA) && defined(PADDLE_WITH_MCCL) + return GetDsoHandleFromSearchPath( + FLAGS_mccl_dir, "libmccl.so", true, {}, warning_msg); #else return GetDsoHandleFromSearchPath( FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); diff --git a/paddle/phi/backends/dynload/mccl.cc b/paddle/phi/backends/dynload/mccl.cc new file mode 100644 index 0000000000000..d6f0208780de8 --- /dev/null +++ b/paddle/phi/backends/dynload/mccl.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/mccl.h" + +namespace phi { +namespace dynload { + +std::once_flag mccl_dso_flag; +void *mccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mccl.h b/paddle/phi/backends/dynload/mccl.h new file mode 100644 index 0000000000000..19ab0246f99d7 --- /dev/null +++ b/paddle/phi/backends/dynload/mccl.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag mccl_dso_flag; +extern void* mccl_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_MCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using mccl_func = decltype(&::__name); \ + std::call_once(mccl_dso_flag, []() { \ + mccl_dso_handle = phi::dynload::GetNCCLDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(mccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define MCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(mcclCommInitAll); \ + __macro(mcclGetUniqueId); \ + __macro(mcclCommInitRank); \ + __macro(mcclCommDestroy); \ + __macro(mcclCommCount); \ + __macro(mcclCommCuDevice); \ + __macro(mcclCommUserRank); \ + __macro(mcclAllReduce); \ + __macro(mcclBcast); \ + __macro(mcclAllGather); \ + __macro(mcclGroupStart); \ + __macro(mcclGroupEnd); \ + __macro(mcclReduce); \ + __macro(mcclReduceScatter); \ + __macro(mcclGetErrorString); \ + __macro(mcclBroadcast); \ + __macro(mcclGetVersion); \ + __macro(mcclSend); \ + __macro(mcclRecv); \ + __macro(mcclRedOpCreatePreMulSum); \ + __macro(mcclRedOpDestroy); + +MCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mublas.cc b/paddle/phi/backends/dynload/mublas.cc new file mode 100644 index 0000000000000..72c0e9954311e --- /dev/null +++ b/paddle/phi/backends/dynload/mublas.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/mublas.h" + +namespace phi { +namespace dynload { +std::once_flag mublas_dso_flag; +void *mublas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mublas.h b/paddle/phi/backends/dynload/mublas.h new file mode 100644 index 0000000000000..3b91a703f5775 --- /dev/null +++ b/paddle/phi/backends/dynload/mublas.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include // NOLINT +#include + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag mublas_dso_flag; +extern void *mublas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mublas routine + * via operator overloading. 
+ * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using mublas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(mublas_dso_flag, []() { \ + mublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(mublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define MUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(mublasSaxpy); \ + __macro(mublasDaxpy); \ + __macro(mublasCaxpy); \ + __macro(mublasZaxpy); \ + __macro(mublasSscal); \ + __macro(mublasDscal); \ + __macro(mublasScopy); \ + __macro(mublasDcopy); \ + __macro(mublasSgemv); \ + __macro(mublasDgemv); \ + __macro(mublasCgemv); \ + __macro(mublasZgemv); \ + __macro(mublasSgemm); \ + __macro(mublasDgemm); \ + __macro(mublasCgemm); \ + __macro(mublasZgemm); \ + __macro(mublasHgemm); \ + __macro(mublasSgeam); \ + __macro(mublasDgeam); \ + __macro(mublasDtrsm); \ + __macro(mublasCtrsm); \ + __macro(mublasZtrsm); \ + __macro(mublasCreate); \ + __macro(mublasDestroy); \ + __macro(mublasSetStream); \ + __macro(mublasSetPointerMode); \ + __macro(mublasGetPointerMode); \ + __macro(mublasSgemmBatched); \ + __macro(mublasDgemmBatched); \ + __macro(mublasCgemmBatched); \ + __macro(mublasZgemmBatched); + +MUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) + +#undef DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mudnn.cc b/paddle/phi/backends/dynload/mudnn.cc new file mode 100644 index 0000000000000..87b51cb8bb56a --- /dev/null +++ b/paddle/phi/backends/dynload/mudnn.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/mudnn.h" + +namespace phi { +namespace dynload { + +bool HasCUDNN() { + // note: mudnn.so is not imported by dlopen, which will be linked + // in cmakelist.txt. + return true; +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/mudnn.h b/paddle/phi/backends/dynload/mudnn.h new file mode 100644 index 0000000000000..66ba6a21b28cf --- /dev/null +++ b/paddle/phi/backends/dynload/mudnn.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#ifdef PADDLE_WITH_MUSA + +namespace phi { +namespace dynload { + +extern bool HasCUDNN(); + +} // namespace dynload +} // namespace phi +#endif diff --git a/paddle/phi/backends/dynload/murand.cc b/paddle/phi/backends/dynload/murand.cc new file mode 100644 index 0000000000000..bd88319b0d524 --- /dev/null +++ b/paddle/phi/backends/dynload/murand.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/murand.h" + +namespace phi { +namespace dynload { + +std::once_flag murand_dso_flag; +void *murand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/murand.h b/paddle/phi/backends/dynload/murand.h new file mode 100644 index 0000000000000..64aa082b5a1b8 --- /dev/null +++ b/paddle/phi/backends/dynload/murand.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { +extern std::once_flag murand_dso_flag; +extern void *murand_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_MURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + murandStatus_t operator()(Args... args) { \ + using murandFunc = decltype(&::__name); \ + std::call_once(murand_dso_flag, []() { \ + murand_dso_handle = phi::dynload::GetCurandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(murand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define MURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(murandCreateGenerator); \ + __macro(murandSetStream); \ + __macro(murandSetPseudoRandomGeneratorSeed); \ + __macro(murandGenerateUniform); \ + __macro(murandGenerateUniformDouble); \ + __macro(murandGenerateNormal); \ + __macro(murandDestroyGenerator); + +MURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MURAND_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musa_driver.cc b/paddle/phi/backends/dynload/musa_driver.cc new file mode 100644 index 0000000000000..2173a8d6cdd81 --- /dev/null +++ b/paddle/phi/backends/dynload/musa_driver.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/musa_driver.h" + +namespace phi { +namespace dynload { + +std::once_flag musa_dso_flag; +void* musa_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSA_ROUTINE_EACH(DEFINE_WRAP); + +bool HasCUDADriver() { + std::call_once(musa_dso_flag, []() { musa_dso_handle = GetCUDADsoHandle(); }); + return musa_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musa_driver.h b/paddle/phi/backends/dynload/musa_driver.h new file mode 100644 index 0000000000000..3534ab8213c93 --- /dev/null +++ b/paddle/phi/backends/dynload/musa_driver.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag musa_dso_flag; +extern void* musa_dso_handle; +extern bool HasCUDADriver(); + +#define DECLARE_DYNAMIC_LOAD_MUSA_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using musa_func = decltype(&::__name); \ + std::call_once(musa_dso_flag, []() { \ + musa_dso_handle = phi::dynload::GetCUDADsoHandle(); \ + }); \ + static void* p_##__name = dlsym(musa_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed musa driver functions + **/ +#define MUSA_ROUTINE_EACH(__macro) \ + __macro(muInit); \ + __macro(muDriverGetVersion); \ + __macro(muGetErrorString); \ + __macro(muModuleLoadData); \ + __macro(muModuleGetFunction); \ + __macro(muModuleUnload); \ + __macro(muOccupancyMaxActiveBlocksPerMultiprocessor); \ + __macro(muLaunchKernel); \ + __macro(muCtxCreate); \ + __macro(muCtxGetCurrent); \ + __macro(muDeviceGetCount); \ + __macro(muDevicePrimaryCtxGetState); \ + __macro(muDeviceGetAttribute); \ + __macro(muDeviceGet); + +MUSA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUSA_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_MUSA_WRAP + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.cc b/paddle/phi/backends/dynload/musartc.cc new file mode 100644 index 0000000000000..9cd25270a1016 --- /dev/null +++ b/paddle/phi/backends/dynload/musartc.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/musartc.h" + +namespace phi { +namespace dynload { + +std::once_flag musartc_dso_flag; +void* musartc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSARTC_ROUTINE_EACH(DEFINE_WRAP); + +bool HasNVRTC() { + std::call_once(musartc_dso_flag, + []() { musartc_dso_handle = GetNVRTCDsoHandle(); }); + return musartc_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.h b/paddle/phi/backends/dynload/musartc.h new file mode 100644 index 0000000000000..317621090a5b3 --- /dev/null +++ b/paddle/phi/backends/dynload/musartc.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag musartc_dso_flag; +extern void* musartc_dso_handle; +extern bool HasNVRTC(); + +#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> DECLARE_TYPE(__name, args...) { \ + using musartc_func = decltype(&::__name); \ + std::call_once(musartc_dso_flag, []() { \ + musartc_dso_handle = phi::dynload::GetNVRTCDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(musartc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed musartc functions + **/ +#define MUSARTC_ROUTINE_EACH(__macro) \ + __macro(mtrtcVersion); \ + __macro(mtrtcGetErrorString); \ + __macro(mtrtcCompileProgram); \ + __macro(mtrtcCreateProgram); \ + __macro(mtrtcDestroyProgram); \ + __macro(mtrtcGetMUSA); \ + __macro(mtrtcGetMUSASize); \ + __macro(mtrtcGetProgramLog); \ + __macro(mtrtcGetProgramLogSize) + +MUSARTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musparse.cc b/paddle/phi/backends/dynload/musparse.cc new file mode 100644 index 0000000000000..35ccd602e63ba --- /dev/null +++ b/paddle/phi/backends/dynload/musparse.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/musparse.h" + +namespace phi { +namespace dynload { + +std::once_flag musparse_dso_flag; +void *musparse_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MUSPARSE_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/musparse.h b/paddle/phi/backends/dynload/musparse.h new file mode 100644 index 0000000000000..595e6d490d5e4 --- /dev/null +++ b/paddle/phi/backends/dynload/musparse.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { +extern std::once_flag musparse_dso_flag; +extern void *musparse_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + musparseStatus_t operator()(Args... 
args) { \ + using Func = decltype(&::__name); \ + std::call_once(musparse_dso_flag, []() { \ + musparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(musparse_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#if defined(PADDLE_WITH_MUSA) +#define MUSPARSE_ROUTINE_EACH(__macro) \ + __macro(musparseSetStream); \ + __macro(musparseCreateMatDescr); \ + __macro(musparseSnnz); \ + __macro(musparseDnnz); \ + __macro(musparseSetMatType); \ + __macro(musparseSetMatIndexBase); \ + __macro(musparseCreateCsr); \ + __macro(musparseCreateCoo); \ + __macro(musparseCreateDnMat); \ + __macro(musparseCreateDnVec); \ + __macro(musparseSpMM); \ + __macro(musparseDestroySpMat); \ + __macro(musparseDestroyDnMat); \ + __macro(musparseDestroyDnVec); \ + __macro(musparseSpMV); \ + __macro(musparseSDDMM_bufferSize); \ + __macro(musparseSDDMM_preprocess); \ + __macro(musparseSDDMM); \ + __macro(musparseDnMatSetStridedBatch); \ + __macro(musparseCooSetStridedBatch); \ + __macro(musparseCsrSetStridedBatch); + +MUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP) + +#endif // PADDLE_WITH_MUSA + +#undef DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/gpu/forwards.h b/paddle/phi/backends/gpu/forwards.h index e1f3492f76870..4437f9c315ff0 100644 --- a/paddle/phi/backends/gpu/forwards.h +++ b/paddle/phi/backends/gpu/forwards.h @@ -72,6 +72,13 @@ using cufftHandle = int; // Forward declaration of NCCL types. using ncclComm_t = struct ncclComm *; +// Forward declaration of MUSA runtime types. +using musaStream_t = struct MUstream_st *; +using musaEvent_t = struct MUevent_st *; +using mublasHandle_t = struct _mublasHandle_t *; +using mudnnHandle_t = class Handle *; +using musparseHandle_t = struct _musparse_handle *; + /// Forward declaration of ROCM types. #include diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 5c9c010d365e4..615ab755f8a78 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -43,6 +43,15 @@ limitations under the License. 
*/ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/musparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/phi/backends/dynload/mccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" @@ -119,6 +128,9 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); @@ -143,11 +155,22 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void* user_data) #endif + +#ifdef PADDLE_WITH_MUSA +#if MUSA_VERSION >= 10000 + static void StreamCallbackFunc(void* user_data) +#else + static void StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, + void* user_data) +#endif +#endif + #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void* user_data) + static void CUDART_CB StreamCallbackFunc(void* user_data) #else - static void CUDART_CB + static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void* user_data) #endif #endif @@ -170,6 +193,8 @@ void DnnWorkspaceHandle::RunFuncSync( std::lock_guard guard(*mtx_); #ifdef PADDLE_WITH_HIP auto status = hipMalloc(&workspace_ptr, size); +#elif defined(PADDLE_WITH_MUSA) + auto status = musaMalloc(&workspace_ptr, size); #else auto status = cudaMalloc(&workspace_ptr, size); #endif @@ -178,6 +203,8 @@ void DnnWorkspaceHandle::RunFuncSync( phi::backends::gpu::GpuStreamSync(stream_); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaFree(workspace_ptr)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); #endif @@ -248,7 +275,9 @@ struct GPUContext::Impl { DestoryInternalWorkspace(); DestoryInternalEigenDevice(); phi::DestroySparseHandle(sparse_handle_); +#ifndef PADDLE_WITH_MUSA phi::DestroySolverHandle(solver_handle_); +#endif phi::DestroyDnnHandle(dnn_handle_); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_comm_) { @@ -264,7 +293,9 @@ struct GPUContext::Impl { phi::DestroyBlasHandle(blas_handle_); phi::DestroyBlasHandle(blas_tensor_core_handle_); phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); +#ifndef PADDLE_WITH_MUSA phi::DestroyBlasLtHandle(blaslt_handle_); +#endif } if (stream_owned_ && stream_) { delete stream_; @@ -425,6 +456,7 @@ struct GPUContext::Impl { blas_tf32_tensor_core_handle_creator_ = std::move(handle_creator); } +#ifndef PADDLE_WITH_MUSA void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } void SetBlasLtHandle(std::function&& handle_creator) { @@ -443,6 +475,7 @@ struct GPUContext::Impl { PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); return blaslt_handle_; } +#endif dnnHandle_t GetDnnHandle() { std::call_once(flag_dnn_, [&]() { @@ -464,7 +497,7 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(dnn_handle_)); dnn_handle_ = 
nullptr; } -#else +#elif defined(PADDLE_WITH_CUDA) if (owned_ && dnn_handle_ != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(dnn_handle_)); dnn_handle_ = nullptr; @@ -478,6 +511,7 @@ struct GPUContext::Impl { dnn_handle_creator_ = std::move(handle_creator); } +#ifndef PADDLE_WITH_MUSA solverHandle_t GetSolverHandle() { std::call_once(flag_slover_, [&]() { if (!solver_handle_) { @@ -497,6 +531,7 @@ struct GPUContext::Impl { void SetSolverHandle(std::function&& handle_creator) { solver_handle_creator_ = std::move(handle_creator); } +#endif sparseHandle_t GetSparseHandle() { std::call_once(flag_sparse_, [&]() { @@ -529,7 +564,19 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) -#else // PADDLE_WITH_HIP + +#elif defined(PADDLE_WITH_MUSA) + musaError_t e_sync = musaSuccess; +#if !defined(_WIN32) + e_sync = musaStreamSynchronize(stream()); +#else + while (e_sync = musaStreamQuery(stream())) { + if (e_sync == musaErrorNotReady) continue; + break; + } +#endif // !defined(_WIN32) + +#else // PADDLE_WITH_MUSA cudaError_t e_sync = cudaSuccess; #if !defined(_WIN32) e_sync = cudaStreamSynchronize(stream()); @@ -539,7 +586,7 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) -#endif // PADDLE_WITH_HIP +#endif // PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(e_sync); } @@ -547,6 +594,8 @@ struct GPUContext::Impl { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream(), ev, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream(), ev, 0)); #endif @@ -678,6 +727,8 @@ struct GPUContext::Impl { void RecordEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(ev, stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream())); #endif @@ -708,11 +759,17 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); #endif +#endif + +#ifdef PADDLE_WITH_MUSA + PADDLE_ENFORCE_GPU_SUCCESS( + musaLaunchHostFunc(stream(), internal::StreamCallbackFunc, func)); #endif } void WaitStreamCallback() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) || \ + defined(PADDLE_WITH_MUSA) phi::backends::gpu::GpuStreamSync(stream()); #endif { @@ -764,12 +821,16 @@ struct GPUContext::Impl { std::function blas_tensor_core_handle_creator_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; std::function blas_tf32_tensor_core_handle_creator_{nullptr}; +#ifndef PADDLE_WITH_MUSA blasLtHandle_t blaslt_handle_{nullptr}; std::function blaslt_handle_creator_{nullptr}; +#endif dnnHandle_t dnn_handle_{nullptr}; std::function dnn_handle_creator_{nullptr}; +#ifndef PADDLE_WITH_MUSA solverHandle_t solver_handle_{nullptr}; std::function solver_handle_creator_{nullptr}; +#endif sparseHandle_t sparse_handle_{nullptr}; std::function sparse_handle_creator_{nullptr}; DnnWorkspaceHandle* workspace_{nullptr}; @@ -839,6 +900,7 @@ blasHandle_t GPUContext::cublas_handle() const { return impl_->GetBlasHandle(); } +#ifndef PADDLE_WITH_MUSA blasLtHandle_t GPUContext::cublaslt_handle() const { return impl_->GetBlasLtHandle(); } @@ -846,6 +908,7 @@ blasLtHandle_t GPUContext::cublaslt_handle() const { solverHandle_t GPUContext::cusolver_dn_handle() 
const { return impl_->GetSolverHandle(); } +#endif sparseHandle_t GPUContext::cusparse_handle() const { return impl_->GetSparseHandle(); @@ -965,6 +1028,7 @@ void GPUContext::SetBlasTF32Handle(std::function&& func) { impl_->SetBlasTF32Handle(std::move(func)); } +#ifndef PADDLE_WITH_MUSA void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { impl_->SetBlasLtHandle(blaslt); } @@ -972,6 +1036,7 @@ void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { void GPUContext::SetBlasLtHandle(std::function&& func) { impl_->SetBlasLtHandle(std::move(func)); } +#endif void GPUContext::SetDnnHandle(dnnHandle_t handle) { impl_->SetDnnHandle(handle); @@ -981,6 +1046,7 @@ void GPUContext::SetDnnHandle(std::function&& func) { impl_->SetDnnHandle(std::move(func)); } +#ifndef PADDLE_WITH_MUSA void GPUContext::SetSolverHandle(solverHandle_t handle) { impl_->SetSolverHandle(handle); } @@ -988,6 +1054,7 @@ void GPUContext::SetSolverHandle(solverHandle_t handle) { void GPUContext::SetSolverHandle(std::function&& func) { impl_->SetSolverHandle(std::move(func)); } +#endif void GPUContext::SetSparseHandle(sparseHandle_t handle) { impl_->SetSparseHandle(handle); @@ -1046,7 +1113,8 @@ void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { void GPUContext::ClearDnnAttr() { return impl_->ClearDnnAttr(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) GPUPinnedContext::GPUPinnedContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index b4a3974378241..ce92612304cda 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -16,7 +16,7 @@ limitations under the License. */ #pragma once #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU_KP) + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_XPU_KP) #include #include @@ -108,11 +108,13 @@ class PADDLE_API GPUContext : public DeviceContext, /*! \brief Return cublas handle in the device context. */ blasHandle_t cublas_handle() const; +#ifndef PADDLE_WITH_MUSA /*! \brief Return cublasLt handle in the device context. */ blasLtHandle_t cublaslt_handle() const; /*! \brief Return cusolver handle in the device context. */ solverHandle_t cusolver_dn_handle() const; +#endif /*! \brief Return cusparse handle in the device context. */ sparseHandle_t cusparse_handle() const; @@ -232,14 +234,18 @@ class PADDLE_API GPUContext : public DeviceContext, void SetBlasTF32Handle(blasHandle_t); void SetBlasTF32Handle(std::function&&); +#ifndef PADDLE_WITH_MUSA void SetBlasLtHandle(blasLtHandle_t); void SetBlasLtHandle(std::function&&); +#endif void SetDnnHandle(dnnHandle_t); void SetDnnHandle(std::function&&); +#ifndef PADDLE_WITH_MUSA void SetSolverHandle(solverHandle_t); void SetSolverHandle(std::function&&); +#endif void SetSparseHandle(sparseHandle_t); void SetSparseHandle(std::function&&); @@ -276,7 +282,8 @@ using GPUDNNContext = GPUContext; // because we want to implement a KPS-based kernel and make it run // on GPU and XPU at the same time, so we need KPSContext when registering // KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) using KPSContext = GPUContext; #endif @@ -287,7 +294,8 @@ struct DefaultDevice; } // namespace Eigen namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // Currently, GPUPinnedContext is only used to data copying. class GPUPinnedContext : public DeviceContext, diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index 4a6b9d2fd87f1..65ec52c5476d0 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -20,18 +20,39 @@ namespace phi { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#else // PADDLE_WITH_CDUA - -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; +#else +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = CUDA_TYPE; -#endif +#endif // PADDLE_WITH_CDUA -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); +DECLARE_TYPE_FOR_GPU(sparseHandle_t, + cusparseHandle_t, + rocsparse_handle, + musparseHandle_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t, mudnnHandle_t); +DECLARE_TYPE_FOR_GPU(blasHandle_t, + cublasHandle_t, + rocblas_handle, + mublasHandle_t); +#undef DECLARE_TYPE_FOR_GPU + +#ifndef PADDLE_WITH_MUSA +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; +#else +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif // PADDLE_WITH_CDUA DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, miopenActivationDescriptor); @@ -56,19 +77,13 @@ DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); - -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); - -DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); - #undef DECLARE_TYPE_FOR_GPU +#endif using CUDAGraphID = unsigned long long; // NOLINT diff --git a/paddle/phi/backends/gpu/gpu_device_function.h b/paddle/phi/backends/gpu/gpu_device_function.h index 0f79e2a645ab3..a5728c25012f9 100644 --- a/paddle/phi/backends/gpu/gpu_device_function.h +++ b/paddle/phi/backends/gpu/gpu_device_function.h @@ -13,10 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_device_function.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_device_function.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_device_function.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index f37afa3deeb74..b67010344a64e 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -14,11 +14,14 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" +#elif defined(PADDLE_WITH_MUSA) + #else // CUDA #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_helper.h b/paddle/phi/backends/gpu/gpu_helper.h index 2353b42794ffd..456681bb2b5d6 100644 --- a/paddle/phi/backends/gpu/gpu_helper.h +++ b/paddle/phi/backends/gpu/gpu_helper.h @@ -13,10 +13,13 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_helper.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/gpu/musa/musa_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_helper.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index ebf57bd06eb19..70b4ebd21294e 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -11,7 +11,8 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index a7a7ad03ad664..5080a714bebb3 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -16,10 +16,13 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_CUDA #include +#elif defined(PADDLE_WITH_MUSA) +#include #else #include #endif diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index a77527c081650..d46ada073c47d 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -16,6 +16,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif #ifdef PADDLE_WITH_HIP #include #endif @@ -58,7 +61,8 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_cast(val)); // NOLINT } -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) +#if defined(__HIPCC__) || defined(__MUSACC__) || \ + (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index a447df94cb4dc..2bee37b300258 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -33,6 +33,15 @@ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/musparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#include "paddle/phi/backends/dynload/mccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/rocsparse.h" #endif @@ -64,10 +73,9 @@ void InitGpuProperties(Place place, *driver_version = backends::gpu::GetGPUDriverVersion(place.GetDeviceId()); *runtime_version = backends::gpu::GetGPURuntimeVersion(place.GetDeviceId()); +#ifdef PADDLE_WITH_CUDA const gpuDeviceProp& prop = backends::gpu::GetDeviceProperties(place.GetDeviceId()); - -#ifdef PADDLE_WITH_CUDA static const std::set compiled_archs{CUDA_REAL_ARCHS}; // Make sure compiled cuda arch is as same as runtime cuda arch. if (compiled_archs.find(*compute_capability) == compiled_archs.cend() && @@ -144,6 +152,47 @@ << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; } +#elif defined(PADDLE_WITH_MUSA) + // TODO(@caizhi): mudnnGetVersion is not supported for MUSA now. + // Requests have been submitted to Mudnn. + // size_t mudnn_dso_ver = dynload::mudnnGetVersion(); + size_t mudnn_dso_ver = 1100; + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) + << ", muDNN Version: " << mudnn_dso_ver / 1000 << "." + << (mudnn_dso_ver % 1000) / 100 << "."; + + // Check MUSA/MUDNN version compatibility + auto local_musa_version = + (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; + auto compile_musa_version = + (MUSA_VERSION / 1000) * 10 + (MUSA_VERSION % 100) / 10; +#if defined(__linux__) + PADDLE_ENFORCE_EQ( + (local_musa_version / 10 < compile_musa_version / 10) && + (mudnn_dso_ver / 1000 < MUDNN_VERSION / 1000), + false, + phi::errors::InvalidArgument( + "The installed Paddle is compiled with MUSA%d/muDNN%d, " + "but MUSA/muDNN version in your machine is MUSA%d/muDNN%d, " + "which will cause serious incompatible bug. " + "Please recompile or reinstall Paddle with compatible MUSA/muDNN " + "version.", + compile_musa_version / 10, + MUDNN_VERSION / 1000, + local_musa_version / 10, + mudnn_dso_ver / 1000)); +#endif + if (local_musa_version < compile_musa_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place.device) + << ". The installed Paddle is compiled with MUSA " + << compile_musa_version / 10 << "." << compile_musa_version % 10 + << ", but MUSA runtime version in your machine is " + << local_musa_version / 10 << "." << local_musa_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible MUSA " + "version."; + } #else size_t cudnn_dso_ver = dynload::cudnnGetVersion(); LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) @@ -189,6 +238,9 @@ void InitStream(gpuStream_t* stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithPriority(stream, hipStreamDefault, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaStreamCreateWithPriority(stream, musaStreamDefault, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithPriority(stream, cudaStreamDefault, 0)); @@ -199,6 +251,8 @@ void DestoryStream(gpuStream_t stream) { if (stream != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -210,7 +264,11 @@ void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { #ifdef PADDLE_WITH_HIP phi::dynload::rocblas_create_handle(blas_handle); phi::dynload::rocblas_set_stream(*blas_handle, stream); -#else // PADDLE_WITH_CUDA +#elif defined(PADDLE_WITH_MUSA) + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::mublasCreate(blas_handle)); + PADDLE_RETRY_CUDA_SUCCESS( + phi::dynload::mublasSetStream(*blas_handle, stream)); +#else // PADDLE_WITH_MUSA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); PADDLE_RETRY_CUDA_SUCCESS( phi::dynload::cublasSetStream(*blas_handle, stream)); @@ -223,6 +281,11 @@ void DestroyBlasHandle(blasHandle_t handle) { phi::dynload::rocblas_destroy_handle(handle); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + phi::dynload::mublasDestroy(handle); + handle = nullptr; + } #else if (handle != nullptr) { phi::dynload::cublasDestroy(handle); @@ -231,6 +294,7 @@ void DestroyBlasHandle(blasHandle_t handle) { #endif // PADDLE_WITH_HIP } +#ifndef PADDLE_WITH_MUSA void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 phi::dynload::cublasLtCreate(blaslt_handle); @@ -245,6 +309,7 @@ void DestroyBlasLtHandle(blasLtHandle_t handle) { } #endif } +#endif void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { if (phi::dynload::HasCUDNN()) { @@ -268,7 +333,7 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); -#else +#elif defined(PADDLE_WITH_CUDA) auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; if (local_cudnn_version < static_cast(compile_cudnn_version)) { @@ -296,6 +361,12 @@ void DestroyDnnHandle(dnnHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); handle = nullptr; } +#elif defined(PADDLE_WITH_MUSA) + if (handle != nullptr) { + // TODO(@caizhi): enable dynload module + // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDestroy(handle)); + handle = nullptr; + } #else if (handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); @@ -304,21 +375,23 @@ void DestroyDnnHandle(dnnHandle_t handle) { #endif // PADDLE_WITH_HIP } +#ifndef PADDLE_WITH_MUSA void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); 
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); #endif } void DestroySolverHandle(solverHandle_t solver_handle) { -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_CUDA if (solver_handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); solver_handle = nullptr; } #endif } +#endif void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { // ROCM is not yet supported diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h index 7bec5eebf5886..16d63910b8f4a 100644 --- a/paddle/phi/backends/gpu/gpu_resources.h +++ b/paddle/phi/backends/gpu/gpu_resources.h @@ -35,14 +35,18 @@ void DestoryStream(gpuStream_t stream); void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream); void DestroyBlasHandle(blasHandle_t handle); +#ifndef PADDLE_WITH_MUSA void InitBlasLtHandle(blasLtHandle_t* blaslt_handle); void DestroyBlasLtHandle(blasLtHandle_t handle); +#endif void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place); void DestroyDnnHandle(dnnHandle_t handle); +#ifndef PADDLE_WITH_MUSA void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream); void DestroySolverHandle(solverHandle_t solver_handle); +#endif void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream); void DestroySparseHandle(sparseHandle_t handle); diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index 77f403795b6b3..00c0bdf6c545b 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -17,11 +17,15 @@ #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" +#elif defined(PADDLE_WITH_MUSA) +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" #else // PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" @@ -30,18 +34,39 @@ namespace phi { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#else // PADDLE_WITH_CDUA +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = MUSA_TYPE; + +#else // PADDLE_WITH_MUSA +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif // PADDLE_WITH_CUDA + +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, + cudaMemcpyKind, + hipMemcpyKind, + musaMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, + cudaDeviceProp, + hipDeviceProp_t, + musaDeviceProp); +#undef DECLARE_TYPE_FOR_GPU + +#ifndef PADDLE_WITH_MUSA +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; +#else // PADDLE_WITH_MUSA #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; -#endif +#endif // PADDLE_WITH_CUDA -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, 
hipDeviceProp_t); DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, @@ -50,34 +75,45 @@ DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, miopenActivationMode_t); - #undef DECLARE_TYPE_FOR_GPU +#endif #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = ROCM_CV; +#elif defined(PADDLE_WITH_MUSA) +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ + constexpr auto GPU_CV = MUSA_CV; #else // PADDLE_WITH_CUDA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + hipErrorOutOfMemory, + musaErrorMemoryAllocation); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, + cudaErrorNotReady, + hipErrorNotReady, + musaErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyHostToDevice, cudaMemcpyKind::cudaMemcpyHostToDevice, - hipMemcpyKind::hipMemcpyHostToDevice); + hipMemcpyKind::hipMemcpyHostToDevice, + musaMemcpyKind::musaMemcpyHostToDevice); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost, cudaMemcpyKind::cudaMemcpyDeviceToHost, - hipMemcpyKind::hipMemcpyDeviceToHost); + hipMemcpyKind::hipMemcpyDeviceToHost, + musaMemcpyKind::musaMemcpyDeviceToHost); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, cudaMemcpyKind::cudaMemcpyDeviceToDevice, - hipMemcpyKind::hipMemcpyDeviceToDevice); + hipMemcpyKind::hipMemcpyDeviceToDevice, + musaMemcpyKind::musaMemcpyDeviceToDevice); #undef DECLARE_CONSTANT_FOR_GPU } // namespace phi -#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || + // defined(PADDLE_WITH_MUSA ) diff --git a/paddle/phi/backends/gpu/musa/musa_device_function.h b/paddle/phi/backends/gpu/musa/musa_device_function.h new file mode 100644 index 0000000000000..074bb2ba0cbff --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_device_function.h @@ -0,0 +1,189 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define PADDLE_CUDA_FP16 +// NOTE(): support float16 to half in header file. 
+#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace backends { +namespace gpu { + +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T +CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { + return __shfl_down_sync(mask, val, static_cast(delta), width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, + T val, + int width = warpSize) { + return __shfl_xor_sync(mask, val, width); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( + unsigned mask, phi::dtype::float16 val, int delta, int width) { + return phi::dtype::float16(__shfl_down_sync( + mask, val.to_half(), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( + unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { +#if defined(PADDLE_MUSA_BF16) && defined(__MUSA_ARCH__) && __MUSA_ARCH__ >= 220 + return phi::dtype::bfloat16(__shfl_down_sync( + mask, val.to_mt_bfloat16(), static_cast(delta), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + float real = static_cast(__shfl_down_sync( + mask, static_cast(val.real), static_cast(delta), width)); + float imag = static_cast(__shfl_down_sync( + mask, static_cast(val.imag), static_cast(delta), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + double real = + static_cast(__shfl_down_sync(mask, + static_cast(val.real), + static_cast(delta), + width)); + double imag = + static_cast(__shfl_down_sync(mask, + static_cast(val.imag), + static_cast(delta), + width)); + return phi::dtype::complex(real, imag); +} + +// TODO(@MTAI): there is compiling error when compiling the following code +// template <> +// __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( +// unsigned mask, phi::dtype::float16 val, int width) { +// return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); +// } + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( + unsigned mask, phi::dtype::bfloat16 val, int width) { +#if defined(PADDLE_MUSA_BF16) + return phi::dtype::bfloat16( + __shfl_xor_sync(mask, val.to_mt_bfloat16(), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + float real = static_cast( + 
__shfl_xor_sync(mask, static_cast(val.real), width)); + float imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + double real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + double imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template +__forceinline__ __device__ T +CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { + return __shfl_sync(mask, val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. + const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + } + return val; +} +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_helper.h b/paddle/phi/backends/gpu/musa/musa_helper.h new file mode 100644 index 0000000000000..cbfc458abf8da --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_helper.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace backends { +namespace gpu { + +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (0 >= ((major)*1000 + (minor)*100 + (patch))) + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc new file mode 100644 index 0000000000000..f244601b9d9cc --- /dev/null +++ b/paddle/phi/backends/gpu/musa/musa_info.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
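The reduceSum device function above performs a warp-level tree reduction with CudaShuffleDownSync, halving the active offset each step until lane 0 holds the partial sum. The following plain C++ sketch simulates one 32-lane warp with an array so the same idea can be run on the host; the kWarpSize constant and the lane vector are illustrative and not taken from the patch.

```cpp
// Host-side sketch (illustration only) of the shuffle-down tree reduction
// used by reduceSum above: at each step every lane adds the value held by
// the lane `offset` positions to its right, so lane 0 accumulates the sum.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const int kWarpSize = 32;
  std::vector<int> lane(kWarpSize);
  std::iota(lane.begin(), lane.end(), 1);  // lanes hold 1..32

  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    for (int i = 0; i + offset < kWarpSize; ++i) {
      lane[i] += lane[i + offset];  // models __shfl_down_sync(mask, val, offset)
    }
  }
  std::printf("warp sum = %d\n", lane[0]);  // 528 == 32 * 33 / 2
  return 0;
}
```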
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" +#include "paddle/phi/backends/gpu/gpu_info.h" + +#include "paddle/phi/core/enforce.h" + +#include "musa_runtime.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace phi { +namespace backends { +namespace gpu { + +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + // TODO(@caizhi): mudnnGetVersion is not supported now. + // version info will be returned from mudnnGetVersion later. + const int version_major = 1; + const int version_minor = 1; + const int version_patch = 0; + return version_major * 1000 + version_minor * 100 + version_patch; +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + musaError_t status = musaDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; + return 0; + } + + const auto *musa_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); + + if (musa_visible_devices != nullptr) { + std::string musa_visible_devices_str(musa_visible_devices); + if (!musa_visible_devices_str.empty()) { + musa_visible_devices_str.erase( + 0, musa_visible_devices_str.find_first_not_of('\'')); + musa_visible_devices_str.erase( + musa_visible_devices_str.find_last_not_of('\'') + 1); + musa_visible_devices_str.erase( + 0, musa_visible_devices_str.find_first_not_of('\"')); + musa_visible_devices_str.erase( + musa_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(musa_visible_devices_str.begin(), + musa_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " + "empty. No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int major, minor; + auto major_error_code = + musaDeviceGetAttribute(&major, musaDevAttrComputeCapabilityMajor, id); + auto minor_error_code = + musaDeviceGetAttribute(&minor, musaDevAttrComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { return false; } + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDevAttrMultiProcessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( + &count, musaDevAttrMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetAttribute(&count, musaDevAttrMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); + return device_id; +} + +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + std::array ret; + int size; + auto error_code_x = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret[0] = size; + + auto error_code_y = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret[1] = size; + + auto error_code_z = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret[2] = size; + return ret; +} + +std::pair GetGpuStreamPriorityRange() { + int least_priority, greatest_priority; + PADDLE_ENFORCE_GPU_SUCCESS( + musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); + return std::make_pair(least_priority, greatest_priority); +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(phi::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. 
Please input " + "appropriate device again!", + id, + static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS( + musaGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return musaGetLastError(); } + +bool IsGPUManagedMemorySupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + return false; +} + +bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); + return false; +} + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/capi/lib/c_device_context.cc b/paddle/phi/capi/lib/c_device_context.cc index 96b46fbc0d4ff..e6163e5f362d3 100644 --- a/paddle/phi/capi/lib/c_device_context.cc +++ b/paddle/phi/capi/lib/c_device_context.cc @@ -35,7 +35,8 @@ PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext* ctx, reinterpret_cast(ctx)->stream()); } else if (dev_ctx_type == phi::AllocationType::CPU) { return nullptr; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast( reinterpret_cast(ctx)->stream()); diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc index e9fe2aada1f35..63c4085eface4 100644 --- a/paddle/phi/capi/lib/c_kernel_context.cc +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -30,7 +30,8 @@ PD_DeviceContext* PD_KernelContextGetDeviceContext(PD_KernelContext* ctx) { } else if (dev_ctx_type == phi::AllocationType::CPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 5540592d5013c..342e0a3ebe5ce 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -138,7 +138,8 @@ inline Backend StringToBackend(const char* backend_cstr) { } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // NOTE(chenweihang) KPS is not yet a complete backend, and it still needs // to be converted // to GPU in the GPU environment diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 7ea9b0cbb6477..d553ac9b1ff0c 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -26,6 +26,14 @@ #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif +#if defined(__MUSACC__) +#define PADDLE_MUSA_BF16 +#include +#endif + #if defined(__CUDACC__) && CUDA_VERSION >= 11000 #define PADDLE_CUDA_BF16 #include @@ -61,6 +69,13 @@ struct PADDLE_ALIGN(2) bfloat16 { tempRes = reinterpret_cast(&val); res = *tempRes; x = res >> 16; +#elif defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_MUSA_BF16) + __mt_bfloat16 tmp = __float2bfloat16(val); + x = *reinterpret_cast(&tmp); +#else + std::memcpy(&x, reinterpret_cast(&val) + 2, 2); +#endif #else #if defined(PADDLE_CUDA_BF16) __nv_bfloat16 tmp = __float2bfloat16(val); @@ -154,6 +169,16 @@ struct PADDLE_ALIGN(2) bfloat16 { uint16_t* temp_ptr = reinterpret_cast(&temp); res = *temp_ptr; return res; +#elif defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_MUSA_BF16 + return __bfloat162float(*reinterpret_cast(&x)); +#else + float val = 0.f; + uint16_t temp = x; + std::memcpy( + reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); + return val; +#endif #else #ifdef PADDLE_CUDA_BF16 return __bfloat162float(*reinterpret_cast(&x)); @@ -173,6 +198,12 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif +#ifdef 
PADDLE_MUSA_BF16 + HOSTDEVICE inline __mt_bfloat16 to_mt_bfloat16() const { + return *reinterpret_cast(&x); + } +#endif + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } HOSTDEVICE inline explicit operator int8_t() const { diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index a4e003dd544ad..c4b8ad9055f87 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -26,6 +26,11 @@ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include // NOLINT @@ -37,7 +42,8 @@ #define PADDLE_ALIGN(x) __declspec(align(x)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // todo #define PADDLE_WITH_CUDA_OR_HIP_COMPLEX #endif @@ -62,7 +68,8 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template HOSTDEVICE inline explicit complex(const thrust::complex& c) { @@ -83,6 +90,15 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE inline explicit operator hipDoubleComplex() const { return make_hipDoubleComplex(real, imag); } + +#elif defined(PADDLE_WITH_MUSA) + HOSTDEVICE inline explicit operator muFloatComplex() const { + return make_muFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator muDoubleComplex() const { + return make_muDoubleComplex(real, imag); + } #else HOSTDEVICE inline explicit operator cuFloatComplex() const { return make_cuFloatComplex(real, imag); @@ -187,7 +203,7 @@ template HOSTDEVICE inline complex operator+(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) + thrust::complex(b)); #else return complex(a.real + b.real, a.imag + b.imag); @@ -198,7 +214,7 @@ template HOSTDEVICE inline complex operator-(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) - thrust::complex(b)); #else return complex(a.real - b.real, a.imag - b.imag); @@ -209,7 +225,7 @@ template HOSTDEVICE inline complex operator*(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) * thrust::complex(b)); #else return complex(a.real * b.real - a.imag * b.imag, @@ -221,7 +237,7 @@ template HOSTDEVICE inline complex operator/(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::complex(a) / thrust::complex(b)); #else T denominator = b.real * b.real + b.imag * b.imag; @@ -233,7 +249,7 @@ HOSTDEVICE inline complex operator/(const complex& a, template HOSTDEVICE inline complex operator-(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || 
defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(-thrust::complex(a.real, a.imag)); #else complex res; @@ -247,7 +263,7 @@ template HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) += thrust::complex(b.real, b.imag)); return a; @@ -262,7 +278,7 @@ template HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) -= thrust::complex(b.real, b.imag)); return a; @@ -277,7 +293,7 @@ template HOSTDEVICE inline complex& operator*=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) *= thrust::complex(b.real, b.imag)); return a; @@ -292,7 +308,7 @@ template HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) a = complex(thrust::complex(a.real, a.imag) /= thrust::complex(b.real, b.imag)); return a; @@ -355,7 +371,7 @@ HOSTDEVICE inline complex(min)(const complex& a, const complex& b) { template HOSTDEVICE inline bool(isnan)(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isnan(a.real) || ::isnan(a.imag); #else return std::isnan(a.real) || std::isnan(a.imag); @@ -365,7 +381,7 @@ HOSTDEVICE inline bool(isnan)(const complex& a) { template HOSTDEVICE inline bool isinf(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isinf(a.real) || ::isinf(a.imag); #else return std::isinf(a.real) || std::isinf(a.imag); @@ -375,7 +391,7 @@ HOSTDEVICE inline bool isinf(const complex& a) { template HOSTDEVICE inline bool isfinite(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return ::isfinite(a.real) || ::isfinite(a.imag); #else return std::isfinite(a.real) || std::isfinite(a.imag); @@ -385,7 +401,7 @@ HOSTDEVICE inline bool isfinite(const complex& a) { template HOSTDEVICE inline T abs(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return thrust::abs(thrust::complex(a)); #else return std::abs(std::complex(a)); @@ -395,7 +411,7 @@ HOSTDEVICE inline T abs(const complex& a) { template HOSTDEVICE inline T arg(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return 
thrust::arg(thrust::complex(a)); #else return std::arg(std::complex(a)); @@ -405,7 +421,7 @@ HOSTDEVICE inline T arg(const complex& a) { template HOSTDEVICE inline complex pow(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); #else return complex(std::pow(std::complex(a), std::complex(b))); @@ -415,7 +431,7 @@ HOSTDEVICE inline complex pow(const complex& a, const complex& b) { template HOSTDEVICE inline complex sqrt(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::sqrt(thrust::complex(a))); #else return complex(std::sqrt(std::complex(a))); @@ -425,7 +441,7 @@ HOSTDEVICE inline complex sqrt(const complex& a) { template HOSTDEVICE inline complex tanh(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::tanh(thrust::complex(a))); #else return complex(std::tanh(std::complex(a))); @@ -435,7 +451,7 @@ HOSTDEVICE inline complex tanh(const complex& a) { template HOSTDEVICE inline complex log(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + (defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__)) return complex(thrust::log(thrust::complex(a))); #else return complex(std::log(std::complex(a))); diff --git a/paddle/phi/common/cpstring_impl.h b/paddle/phi/common/cpstring_impl.h index 6783799026d44..b57b485f43bc4 100644 --- a/paddle/phi/common/cpstring_impl.h +++ b/paddle/phi/common/cpstring_impl.h @@ -26,7 +26,7 @@ limitations under the License. 
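The complex&lt;T&gt; operators above delegate to thrust::complex when compiled for a device (now including __MUSACC__) and fall back to explicit formulas on the host. The snippet below is only a quick host-side check that the manual multiply formula from the #else branch agrees with the standard library; it is not code from the patch.

```cpp
// Illustration only: verify the hand-written complex multiply used on the
// CPU path above against std::complex. The device path computes the same
// product through thrust::complex.
#include <cassert>
#include <complex>
#include <cstdio>

int main() {
  std::complex<float> a(1.5f, -2.0f), b(0.5f, 3.0f);
  // Manual formula, as in the #else branch of operator*:
  std::complex<float> manual(a.real() * b.real() - a.imag() * b.imag(),
                             a.imag() * b.real() + a.real() * b.imag());
  std::complex<float> lib = a * b;
  assert(std::abs(manual - lib) < 1e-6f);
  std::printf("(%g, %g)\n", manual.real(), manual.imag());  // (6.75, 3.5)
  return 0;
}
```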
*/ #include "paddle/phi/core/macros.h" -#if (defined(__NVCC__) || defined(__HIPCC__)) +#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ @@ -77,7 +77,8 @@ HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) { } #endif -#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__)) +#if PD_PSTRING_LITTLE_ENDIAN || \ + (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) #define PD_le32toh(x) x #else // PD_PSTRING_LITTLE_ENDIAN #define PD_le32toh(x) swap32(x) @@ -209,7 +210,7 @@ HOSTDEVICE static inline void *PD_Malloc(size_t size) { return malloc(size); } HOSTDEVICE static inline void *PD_Realloc(void *ptr, size_t old_size UNUSED, size_t new_size) { -#if (defined(__NVCC__) || defined(__HIPCC__)) +#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) if (old_size >= new_size) { return ptr; } diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 86168d441ded2..75fea3d88ab0c 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -37,6 +37,10 @@ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #endif @@ -46,6 +50,11 @@ #include #endif +#if defined(__MUSACC__) +#define PADDLE_CUDA_FP16 +#include +#endif + #ifdef __HIPCC__ #define PADDLE_CUDA_FP16 #include @@ -82,8 +91,10 @@ struct PADDLE_ALIGN(2) float16 { // Constructors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ + CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else x = h.x; @@ -101,8 +112,9 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline explicit float16(float val) { -#if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) +#if defined(PADDLE_CUDA_FP16) && \ + (defined(__HIPCC__) || defined(__MUSACC__) || \ + (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -144,7 +156,8 @@ struct PADDLE_ALIGN(2) float16 { // Assignment operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline float16& operator=(const half& rhs) { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ + CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; #else x = rhs.x; @@ -218,7 +231,8 @@ struct PADDLE_ALIGN(2) float16 { // Conversion operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline half to_half() const { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ + CUDA_VERSION >= 9000 __half_raw h; h.x = x; return half(h); @@ -237,8 +251,9 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline operator float() const { -#if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) +#if defined(PADDLE_CUDA_FP16) && \ + (defined(__HIPCC__) || defined(__MUSACC__) || \ + (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -395,7 +410,7 @@ DEVICE inline half operator-(const half& a) { #endif } 
-#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ +#ifdef PADDLE_WITH_CUDA // not defined __HIP_NO_HALF_OPERATORS__ DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT a = a + b; return a; @@ -1017,6 +1032,7 @@ struct is_floating_point std::is_same< phi::dtype::float16, typename std::remove_cv::type>::value> {}; + template <> struct is_signed { static const bool value = true; diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc index f9ef606049297..cf4c3ca12869d 100644 --- a/paddle/phi/common/memory_utils.cc +++ b/paddle/phi/common/memory_utils.cc @@ -69,7 +69,8 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total) { return MemoryUtils::Instance().GpuMemoryUsage(available, total); } diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index f6a4afcea2f78..0aa0c745501ec 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -118,7 +118,8 @@ struct MemoryInterface { int64_t (*device_memory_stat_current_value)(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * @brief get the memory usage of current GPU device. * @@ -271,7 +272,8 @@ class MemoryUtils { return memory_method_->device_memory_stat_current_value(stat_type, dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total) { CheckMemoryMethod(); PADDLE_ENFORCE_NOT_NULL( @@ -372,7 +374,8 @@ void Copy(const Place& dst_place, int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void GpuMemoryUsage(size_t* available, size_t* total); #endif diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index fe15be4b2b909..0f8d7a173ad52 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -123,7 +123,8 @@ static int8_t GetCorrectDeviceIdByPlaceType( switch (place_type) { case paddle::PlaceType::kCPU: return 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) case paddle::PlaceType::kGPU: return phi::backends::gpu::GetCurrentDeviceId(); #endif @@ -169,7 +170,8 @@ bool operator==(PlaceType place_type, const Place &place) { GPUPlace DefaultGPUPlace() { return GPUPlace( -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::backends::gpu::GetCurrentDeviceId()); #else 0); diff --git a/paddle/phi/common/transform.h b/paddle/phi/common/transform.h index e80561284b885..620d3d683fbf0 100644 --- a/paddle/phi/common/transform.h +++ b/paddle/phi/common/transform.h @@ -21,7 +21,7 @@ limitations under the License. 
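Most of the remaining hunks repeat the same three-way preprocessor guard so that MUSA builds take the existing CUDA/HIP code paths. As a hedged aside, such a guard could in principle be collapsed into a single helper macro; PADDLE_WITH_GPU_BACKEND below is an invented name used only for this sketch and is not something the patch defines.

```cpp
// Illustration only: centralizing the repeated CUDA/HIP/MUSA guard in one
// helper macro. The pretend-configuration define makes the sketch runnable.
#include <cstdio>

#define PADDLE_WITH_MUSA  // pretend-configuration for this sketch

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_MUSA)
#define PADDLE_WITH_GPU_BACKEND 1
#else
#define PADDLE_WITH_GPU_BACKEND 0
#endif

int main() {
#if PADDLE_WITH_GPU_BACKEND
  std::printf("built with a GPU backend\n");
#else
  std::printf("CPU-only build\n");
#endif
  return 0;
}
```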
*/ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include #include "thrust/device_ptr.h" @@ -92,7 +92,7 @@ struct Transform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) // PointerToThrustDevicePtr has two specializations, one casts a (CUDA // device) pointer into thrust::device_ptr, the other keeps rest types @@ -153,6 +153,12 @@ struct Transform { CastToCUDATransformIterator(last), CastToCUDATransformIterator(result), op); +#elif defined(__MUSACC__) + thrust::transform(thrust::musa::par.on(context.stream()), + CastToCUDATransformIterator(first), + CastToCUDATransformIterator(last), + CastToCUDATransformIterator(result), + op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first), @@ -184,6 +190,13 @@ struct Transform { CastToCUDATransformIterator(first2), CastToCUDATransformIterator(result), op); +#elif defined(__MUSACC__) + thrust::transform(thrust::musa::par.on(context.stream()), + CastToCUDATransformIterator(first1), + CastToCUDATransformIterator(last1), + CastToCUDATransformIterator(first2), + CastToCUDATransformIterator(result), + op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first1), diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 947c7fb45c5fc..24eb8115e970f 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -57,7 +57,8 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { switch (backend) { case phi::Backend::CPU: return phi::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) case phi::Backend::GPU: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -66,7 +67,8 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::ONEDNN: return phi::CPUPlace(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -77,7 +79,8 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif case phi::Backend::KPS: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) return phi::GPUPlace( set_device_id ? 
phi::backends::gpu::GetCurrentDeviceId() : 0); #elif defined(PADDLE_WITH_XPU_KP) diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index b27770b081433..26ec22f103a90 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -28,6 +28,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_MUSA +#include +using gpuStream_t = musaStream_t; +#endif + #include "glog/logging.h" #include "paddle/phi/core/enforce.h" @@ -73,6 +78,9 @@ class CUDAStream { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream, static_cast(flag), priority)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreateWithPriority( + &stream, static_cast(flag), priority)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream, static_cast(flag), priority)); @@ -92,6 +100,8 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(raw_stream())); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(raw_stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(raw_stream())); #endif @@ -112,6 +122,14 @@ class CUDAStream { if (err == hipErrorNotReady) { return false; } +#elif defined(PADDLE_WITH_MUSA) + musaError_t err = musaStreamQuery(raw_stream()); + if (err == musaSuccess) { + return true; + } + if (err == musaErrorNotReady) { + return false; + } #else cudaError_t err = cudaStreamQuery(raw_stream()); if (err == cudaSuccess) { @@ -134,6 +152,8 @@ class CUDAStream { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(raw_stream(), ev, 0)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(raw_stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(raw_stream(), ev, 0)); #endif @@ -146,6 +166,8 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP hipStreamDestroy(raw_stream()); +#elif defined(PADDLE_WITH_MUSA) + musaStreamDestroy(raw_stream()); #else cudaStreamDestroy(raw_stream()); #endif diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 6b98fd0488595..4adac10dd658f 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -35,6 +35,17 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include +#include +#include +#include +#include +#include +#include +using mudnnStatus_t = ::musa::dnn::Status; +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include #include @@ -75,6 +86,17 @@ limitations under the License. */ #endif // __APPLE__ #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/dynload/mublas.h" +#include "paddle/phi/backends/dynload/mudnn.h" +#include "paddle/phi/backends/dynload/murand.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include + +#include "paddle/phi/backends/dynload/mccl.h" +#endif // __APPLE__ +#endif // PADDLE_WITH_MUSA + #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hipfft.h" #include "paddle/phi/backends/dynload/hiprand.h" @@ -90,7 +112,8 @@ limitations under the License. 
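The Query() branch added to CUDAStream above distinguishes three outcomes of musaStreamQuery: success means the stream has drained, musaErrorNotReady means work is still in flight, and any other code is treated as a hard error. The following host-only model captures that tri-state handling; the StreamStatus enum is an invented stand-in for musaError_t.

```cpp
// Illustration only: the tri-state stream-query logic used by
// CUDAStream::Query() above, modeled with a fake status enum.
#include <cstdio>
#include <stdexcept>

enum class StreamStatus { kSuccess, kNotReady, kInvalidHandle };

bool StreamQuery(StreamStatus s) {
  if (s == StreamStatus::kSuccess) return true;     // all queued work finished
  if (s == StreamStatus::kNotReady) return false;   // work still pending
  // Real code forwards any other status to PADDLE_ENFORCE_GPU_SUCCESS.
  throw std::runtime_error("stream query failed");
}

int main() {
  std::printf("%d\n", StreamQuery(StreamStatus::kSuccess));   // 1
  std::printf("%d\n", StreamQuery(StreamStatus::kNotReady));  // 0
  return 0;
}
```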
*/ // Note: these headers for simplify demangle type string #include "paddle/phi/core/type_defs.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_types.h" #endif @@ -391,6 +414,17 @@ struct EnforceNotMet : public std::exception { abort(); \ } \ } while (0) +#elif defined(__MUSACC__) +#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ + do { \ + if (!(_IS_NOT_ERROR)) { \ + printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", \ + __FILE__, \ + __LINE__, \ + #_IS_NOT_ERROR, \ + ##__VA_ARGS__); \ + } \ + } while (0) #else #define PADDLE_ENFORCE(COND, ...) \ do { \ @@ -830,6 +864,273 @@ inline void retry_sleep(unsigned milliseconds) { #undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_CUDA +/************************************************************************/ +/**************************** MUSA ERROR ********************************/ +#ifdef PADDLE_WITH_MUSA + +namespace details { + +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_EXTERNAL_API_TYPE(musaError_t, musaSuccess); +DEFINE_EXTERNAL_API_TYPE(murandStatus_t, MURAND_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(mudnnStatus_t, ::musa::dnn::Status::SUCCESS); +DEFINE_EXTERNAL_API_TYPE(mublasStatus_t, MUBLAS_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(musparseStatus_t, MUSPARSE_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(mufftResult_t, MUFFT_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(MUresult, MUSA_SUCCESS); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +DEFINE_EXTERNAL_API_TYPE(mcclResult_t, mcclSuccess); +#endif + +} // namespace details + +/*************** MUSA ERROR ***************/ +inline bool is_error(musaError_t e) { return e != musaSuccess; } + +inline std::string build_musa_error_msg(musaError_t e) { + std::ostringstream sout; + sout << "MUSA error(" << e << "), " << musaGetErrorString(e) << ". 
"; + return sout.str(); +} + +/*************** MURAND ERROR ***************/ +inline bool is_error(murandStatus_t stat) { + return stat != MURAND_STATUS_SUCCESS; +} + +inline const char* murandGetErrorString(murandStatus_t stat) { + switch (stat) { + case MURAND_STATUS_SUCCESS: + return "MURAND_STATUS_SUCCESS"; + case MURAND_STATUS_VERSION_MISMATCH: + return "MURAND_STATUS_VERSION_MISMATCH"; + case MURAND_STATUS_NOT_CREATED: + return "MURAND_STATUS_NOT_CREATED"; + case MURAND_STATUS_ALLOCATION_FAILED: + return "MURAND_STATUS_ALLOCATION_FAILED"; + case MURAND_STATUS_TYPE_ERROR: + return "MURAND_STATUS_TYPE_ERROR"; + case MURAND_STATUS_OUT_OF_RANGE: + return "MURAND_STATUS_OUT_OF_RANGE"; + case MURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "MURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case MURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "MURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case MURAND_STATUS_LAUNCH_FAILURE: + return "MURAND_STATUS_LAUNCH_FAILURE"; + case MURAND_STATUS_INTERNAL_ERROR: + return "MURAND_STATUS_INTERNAL_ERROR"; + case MURAND_STATUS_NOT_IMPLEMENTED: + return "MURAND_STATUS_NOT_IMPLEMENTED"; + default: + return "Unknown murand status"; + } +} + +inline std::string build_musa_error_msg(murandStatus_t stat) { + std::ostringstream sout; + sout << "MURAND error: " << murandGetErrorString(stat) << "."; + return sout.str(); +} + +/*************** MUBLAS ERROR ***************/ +inline bool is_error(mublasStatus_t stat) { + return stat != MUBLAS_STATUS_SUCCESS; +} + +inline const char* mublasGetErrorString(mublasStatus_t stat) { + switch (stat) { + case MUBLAS_STATUS_SUCCESS: + return "MUBLAS_STATUS_SUCCESS"; + case MUBLAS_STATUS_INVALID_HANDLE: + return "MUBLAS_STATUS_INVALID_HANDLE"; + case MUBLAS_STATUS_NOT_IMPLEMENTED: + return "MUBLAS_STATUS_NOT_IMPLEMENTED"; + case MUBLAS_STATUS_INVALID_POINTER: + return "MUBLAS_STATUS_INVALID_POINTER"; + case MUBLAS_STATUS_INVALID_SIZE: + return "MUBLAS_STATUS_INVALID_SIZE"; + case MUBLAS_STATUS_MEMORY_ERROR: + return "MUBLAS_STATUS_MEMORY_ERROR"; + case MUBLAS_STATUS_INTERNAL_ERROR: + return "MUBLAS_STATUS_INTERNAL_ERROR"; + case MUBLAS_STATUS_PERF_DEGRADED: + return "MUBLAS_STATUS_PERF_DEGRADED"; + case MUBLAS_STATUS_SIZE_QUERY_MISMATCH: + return "MUBLAS_STATUS_SIZE_QUERY_MISMATCH"; + case MUBLAS_STATUS_SIZE_INCREASED: + return "MUBLAS_STATUS_SIZE_INCREASED"; + case MUBLAS_STATUS_SIZE_UNCHANGED: + return "MUBLAS_STATUS_SIZE_UNCHANGED"; + case MUBLAS_STATUS_INVALID_VALUE: + return "MUBLAS_STATUS_INVALID_VALUE"; + case MUBLAS_STATUS_CONTINUE: + return "MUBLAS_STATUS_CONTINUE"; + default: + return "Unknown mublas status"; + } +} +inline std::string build_musa_error_msg(mublasStatus_t stat) { + std::ostringstream sout; + sout << "MUBLAS error: " << mublasGetErrorString(stat) << "."; + return sout.str(); +} + +/*************** MUSPARSE ERROR ***************/ +inline bool is_error(musparseStatus_t stat) { + return stat != MUSPARSE_STATUS_SUCCESS; +} + +inline const char* musparseGetErrorString(musparseStatus_t stat) { + switch (stat) { + case MUSPARSE_STATUS_SUCCESS: + return "MUSPARSE_STATUS_SUCCESSS"; + case MUSPARSE_STATUS_INVALID_HANDLE: + return "MUSPARSE_STATUS_INVALID_HANDLE"; + case MUSPARSE_STATUS_NOT_IMPLEMENTED: + return "MUSPARSE_STATUS_NOT_IMPLEMENTED"; + case MUSPARSE_STATUS_INVALID_POINTER: + return "MUSPARSE_STATUS_INVALID_POINTER"; + case MUSPARSE_STATUS_INVALID_SIZE: + return "MUSPARSE_STATUS_INVALID_SIZE"; + case MUSPARSE_STATUS_MEMORY_ERROR: + return "MUSPARSE_STATUS_MEMORY_ERROR"; + case MUSPARSE_STATUS_INTERNAL_ERROR: + return 
"MUSPARSE_STATUS_INTERNAL_ERROR"; + case MUSPARSE_STATUS_INVALID_VALUE: + return "MUSPARSE_STATUS_INVALID_VALUE"; + case MUSPARSE_STATUS_ARCH_MISMATCH: + return "MUSPARSE_STATUS_ARCH_MISMATCH"; + case MUSPARSE_STATUS_ZERO_PIVOT: + return "MUSPARSE_STATUS_ZERO_PIVOT"; + case MUSPARSE_STATUS_NOT_INITIALIZED: + return "MUSPARSE_STATUS_NOT_INITIALIZED"; + case MUSPARSE_STATUS_TYPE_MISMATCH: + return "MUSPARSE_STATUS_TYPE_MISMATCH"; + case MUSPARSE_STATUS_REQUIRES_SORTED_STORAGE: + return "MUSPARSE_STATUS_REQUIRES_SORTED_STORAGE"; + default: + return "Unknown musparse status"; + } +} + +inline std::string build_musa_error_msg(musparseStatus_t stat) { + std::ostringstream sout; + sout << "MUSparse error: " << musparseGetErrorString(stat) << "."; + return sout.str(); +} + +/**************** MCCL ERROR ****************/ +#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) +inline bool is_error(mcclResult_t mccl_result) { + return mccl_result != mcclSuccess; +} + +inline std::string build_musa_error_msg(mcclResult_t mccl_result) { + std::ostringstream sout; + sout << "MCCL error(" << mccl_result << "), " + << phi::dynload::mcclGetErrorString(mccl_result) << ". "; + if (errno == ENOSPC || errno == EAGAIN) { + std::string detail(strerror(errno)); + detail += "\nPlease try one of the following solutions:"; + detail += "\n1. export MCCL_SHM_DISABLE=1;"; + detail += "\n2. export MCCL_P2P_LEVEL=SYS;"; + detail += + "\n3. Increase shared memory by setting the -shm-size " + "option when starting docker container, e.g., setting " + " -shm-size=2g.\n"; + sout << " Detail: " + detail; + } + return sout.str(); +} +#endif // not(__APPLE__) and PADDLE_WITH_MCCL + +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#define PADDLE_WARN_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + ::phi::enforce::ThrowWarnInternal( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + } \ + } while (0) + +#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP) \ + do { \ + auto res = musaGetLastError(); \ + if (UNLIKELY(res != musaSuccess)) { \ + auto msg = ::phi::enforce::build_musa_error_msg(res); \ + PADDLE_THROW( \ + phi::errors::Fatal("MUSA error after kernel (%s): %s", OP, msg)); \ + } \ + } while (0) + +inline void retry_sleep(unsigned milliseconds) { +#ifdef _WIN32 + Sleep(milliseconds); +#else + if (milliseconds < 1000) { + // usleep argument must be less than 1,000,000. 
Reference: + // https://pubs.opengroup.org/onlinepubs/7908799/xsh/usleep.html + usleep(milliseconds * 1000); + } else { + // clip to sleep in seconds because we can not and don't have to + // sleep for exact milliseconds + sleep(milliseconds / 1000); + } +#endif +} + +#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + int retry_count = 1; \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ + phi::enforce::retry_sleep(10000); \ + __cond__ = (COND); \ + ++retry_count; \ + } \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_musa_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#undef DEFINE_EXTERNAL_API_TYPE +#endif // PADDLE_WITH_MUSA + /**************************************************************************/ /***************************** HIP ERROR **********************************/ #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 0c581fb09919f..ec6ac698cf567 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -14,7 +14,8 @@ // limitations under the License. #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" #endif @@ -120,7 +121,8 @@ PHI_DEFINE_EXPORTED_bool( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * CUDA related related FLAG @@ -215,7 +217,8 @@ PHI_DEFINE_EXPORTED_bool( true, "Whether enable api kernel fallback to CPU one when not found"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * CUDNN related FLAG * Name: FLAGS_cudnn_deterministic @@ -322,7 +325,8 @@ PHI_DEFINE_EXPORTED_bool( "batch_norm, default is False."); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) /** * NCCL related FLAG @@ -541,8 +545,9 @@ PHI_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUSTOM_DEVICE) || \ + defined(PADDLE_WITH_XPU) /** * Memory related FLAG @@ -785,7 +790,8 @@ PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_off, * Example: * Note: Check kernel launch status after every kernel compute. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool( check_kernel_launch, false, @@ -800,7 +806,8 @@ PHI_DEFINE_EXPORTED_bool( * Example: * Note: Disable cudnn in conv2d. 
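PADDLE_RETRY_CUDA_SUCCESS above retries a failing runtime call up to five times, sleeping ten seconds between attempts, and only then throws. The small host-side sketch below shows the same retry loop in isolation; RetryUntilSuccess and its callback are illustrative names, not part of the patch.

```cpp
// Illustration only: the retry loop implemented by PADDLE_RETRY_CUDA_SUCCESS
// above, rewritten as a plain helper so it can run on the host.
#include <cstdio>
#include <functional>

bool RetryUntilSuccess(const std::function<bool()>& op, int max_attempts) {
  for (int attempt = 1; attempt <= max_attempts; ++attempt) {
    if (op()) return true;  // success, stop retrying
    std::printf("attempt %d failed, retrying\n", attempt);
    // Real macro: phi::enforce::retry_sleep(10000);  // 10 s between attempts
  }
  return false;  // caller reports the failure, as the macro throws
}

int main() {
  int calls = 0;
  bool ok = RetryUntilSuccess([&] { return ++calls >= 3; }, 5);
  std::printf("ok=%d after %d calls\n", ok, calls);  // ok=1 after 3 calls
  return 0;
}
```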
*/ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); @@ -1127,7 +1134,8 @@ PHI_DEFINE_EXPORTED_bool(gpugraph_debug_gpu_memory, * Example: * Note: nccl blocking wait. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); #endif diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index 4ed25af0814df..d5e10d9c2d006 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -63,7 +63,8 @@ const std::shared_ptr& DefaultXPUGenerator(int64_t device_id) { } const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) static int64_t num_cuda_devices = -1; static std::once_flag num_devices_init_flag; @@ -278,7 +279,8 @@ uint64_t Generator::Random64() { std::pair Generator::IncrementOffset( uint64_t increment_offset) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) std::lock_guard lock(this->mu_); uint64_t cur_offset = this->state_.thread_offset; this->state_.thread_offset += increment_offset; diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index decebbe66a538..85feb0d060439 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -18,6 +18,10 @@ #include #endif +#ifdef __MUSACC__ +#include +#endif + #if defined(__xpu__) #include @@ -26,7 +30,8 @@ #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 6511efa0152ee..dc0134da132dc 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -120,7 +120,8 @@ const Kernel& KernelFactory::SelectKernelWithGPUDNN( return empty_kernel; } KernelKey kernel_key = KernelKey(const_kernel_key); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); @@ -221,7 +222,8 @@ KernelResult KernelFactory::SelectKernelOrThrowError( KernelKey kernel_key = KernelKey(const_kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, const_kernel_key.dtype()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index f4e021f7269a7..ce795bf781577 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -60,7 +60,8 @@ struct 
KernelArgsParseFunctor { #if defined(PADDLE_WITH_MKLDNN) || arg_type == std::type_index(typeid(const OneDNNContext&)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) || arg_type == std::type_index(typeid(const GPUContext&)) #elif defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || arg_type == std::type_index(typeid(const XPUContext&)) @@ -1401,7 +1402,8 @@ struct KernelRegistrar { meta_kernel_fn, \ BACKEND_LIST) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #define _DEVICE GPU, #elif defined(PADDLE_WITH_XPU) #define _DEVICE XPU, diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index f4dc4636bdde3..1aad3dd59611e 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -273,7 +273,8 @@ struct KernelImpl { /* DeviceContext Helpers */ PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext); #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/phi/core/macros.h b/paddle/phi/core/macros.h index 2e78357492734..f3dae52b04387 100644 --- a/paddle/phi/core/macros.h +++ b/paddle/phi/core/macros.h @@ -53,7 +53,7 @@ namespace phi { #define PD_CONCATENATE2(arg1, arg2) arg1##arg2 #define PD_EXPAND(x) x -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #define PADDLE_RESTRICT __restrict__ #else #define PADDLE_RESTRICT diff --git a/paddle/phi/core/mixed_vector.cc b/paddle/phi/core/mixed_vector.cc index 857bd546befcd..778ec44c28ee3 100644 --- a/paddle/phi/core/mixed_vector.cc +++ b/paddle/phi/core/mixed_vector.cc @@ -33,7 +33,8 @@ template void CopyToCPUHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) // COPY GPU Data To CPU auto *dev_ctx = static_cast( phi::DeviceContextPool::Instance().Get((*gpu_)->place())); @@ -55,7 +56,8 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_, const phi::Place &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) void *src = cpu_->data(); *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_); diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 0e465982ba429..c8b14db5615ed 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -114,9 +114,12 @@ void StringTensor::init_holder() { if (place.GetType() == phi::AllocationType::CPU) { std::memset(ptr, 0, bytes_size); } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0, bytes_size); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(ptr, 0, bytes_size); #else cudaMemset(ptr, 0, bytes_size); #endif diff --git 
a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index abe44d3e2550b..e605673ea78e7 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -58,7 +58,8 @@ void Copy(const Context& dev_ctx, #ifdef PADDLE_WITH_MKLDNN dst->set_layout(src.layout()); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if (dst_place.GetType() == AllocationType::GPU || dst_place.GetType() == AllocationType::GPUPINNED) { dst_ptr = dev_ctx.Alloc( @@ -99,7 +100,8 @@ void Copy(const Context& dev_ctx, if (src_place.GetType() == AllocationType::CPU && dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) } else if ((src_place.GetType() == AllocationType::CPU || src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT (dst_place.GetType() == AllocationType::CPU || @@ -386,7 +388,8 @@ template void Copy(const DeviceContext& dev_ctx, bool blocking, TensorArray* dst); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template void Copy(const GPUContext& dev_ctx, const DenseTensor& src, Place dst_place, @@ -468,7 +471,8 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -522,7 +526,8 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -614,7 +619,8 @@ void TensorFromArray(const T* src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -714,7 +720,8 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -756,7 +763,8 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + 
defined(PADDLE_WITH_MUSA) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, diff --git a/paddle/phi/core/utils/array.h b/paddle/phi/core/utils/array.h index 44290b73737fb..2ebf2f933b77a 100644 --- a/paddle/phi/core/utils/array.h +++ b/paddle/phi/core/utils/array.h @@ -54,7 +54,7 @@ class Array { } HOSTDEVICE inline T &at(size_t i) { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) && !defined(__MUSACC__) PADDLE_ENFORCE_LT( i, N, phi::errors::OutOfRange("Array index out of bounds.")); #endif @@ -62,7 +62,7 @@ class Array { } HOSTDEVICE inline const T &at(size_t i) const { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) && !defined(__MUSACC__) PADDLE_ENFORCE_LT( i, N, phi::errors::OutOfRange("Array index out of bounds.")); #endif @@ -103,7 +103,7 @@ class Array { HOSTDEVICE inline T *GetMutable() { return nullptr; } HOSTDEVICE inline T &operator[](size_t) { -#if defined(__HIPCC__) || defined(__CUDA_ARCH__) +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) || defined(__MUSA_ARCH__) // HIP and CUDA will have compile error, if use "obj()" // function declared in block scope cannot have 'static' storage class static T obj{}; @@ -114,7 +114,7 @@ class Array { } HOSTDEVICE inline const T &operator[](size_t) const { -#if defined(__HIPCC__) || defined(__CUDA_ARCH__) +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) || defined(__MUSA_ARCH__) // HIP and CUDA will have compile error, if use "obj()" // function declared in block scope cannot have 'static' storage class static const T obj{}; diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index 2a554525024c8..9a7dc398f2f7f 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -61,11 +61,12 @@ template class TypeInfoTraits; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template class TypeInfoTraits; #endif diff --git a/paddle/phi/core/utils/visit_place.h b/paddle/phi/core/utils/visit_place.h index 6318b17647cd6..874e4ebcaa37b 100644 --- a/paddle/phi/core/utils/visit_place.h +++ b/paddle/phi/core/utils/visit_place.h @@ -25,7 +25,8 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, const Visitor& visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::GPUPlace p(place.GetDeviceId()); return visitor(p); #else @@ -35,7 +36,8 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) phi::GPUPinnedPlace p; return visitor(p); #else diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 71bbfaa333a0a..818a0698069bd 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -858,7 +858,8 @@ void CoalesceTensorInferMeta(const std::vector& input, size_of_dtype = phi::SizeOf(dtype); } if (config.is_runtime) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) int64_t 
numel = 0; for (size_t i = 0; i < input.size(); ++i) { const auto& dim = input[i]->dims(); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 25367be206139..623e66bc137b8 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -40,6 +40,85 @@ file( "strings/gpu/*.cu" "fusion/gpu/*.cu") +# FIXME(@MTAI): compilation error will occur when compiling the following files. +# This need to be fixed later. +if(WITH_MUSA) + list( + REMOVE_ITEM + kernel_cu + "fusion/gpu/fused_softmax_mask_grad_kernel.cu" + "fusion/gpu/fused_softmax_mask_kernel.cu" + "gpu/batch_norm_grad_kernel.cu" + "gpu/batch_norm_kernel.cu" + "gpu/cholesky_grad_kernel.cu" + "gpu/cholesky_solve_grad_kernel.cu" + "gpu/conv_grad_kernel.cu" + "gpu/conv_kernel.cu" + "gpu/cross_entropy_grad_kernel.cu" + "gpu/cross_entropy_kernel.cu" + "gpu/conv_transpose_grad_kernel.cu" + "gpu/conv_transpose_kernel.cu" + "gpu/cudnn_lstm_grad_kernel.cu" + "gpu/cudnn_lstm_kernel.cu" + "gpu/depthwise_conv_grad_kernel.cu" + "gpu/depthwise_conv_kernel.cu" + "gpu/dist_kernel.cu" + "gpu/elementwise_divide_grad_kernel.cu" + "gpu/elementwise_grad_kernel.cu" + "gpu/elementwise_multiply_grad_kernel.cu" + "gpu/erfinv_kernel.cu" + "gpu/exponential_kernel.cu" + "gpu/fft_grad_kernel.cu" + "gpu/fft_kernel.cu" + "gpu/fused_softmax_mask_grad_kernel.cu" + "gpu/gaussian_kernel.cu" + "gpu/gelu_grad_kernel.cu" + "gpu/gelu_kernel.cu" + "gpu/histogram_kernel.cu" + "gpu/instance_norm_grad_kernel.cu" + "gpu/instance_norm_kernel.cu" + "gpu/interpolate_grad_kernel.cu" + "gpu/kthvalue_grad_kernel.cu" + "gpu/kthvalue_kernel.cu" + "gpu/layer_norm_grad_kernel.cu" + "gpu/layer_norm_kernel.cu" + "gpu/llm_int8_mat_mul_kernel.cu" + "gpu/log_softmax_grad_kernel.cu" + "gpu/log_softmax_kernel.cu" + "gpu/lstsq_kernel.cu" + "gpu/nanmedian_kernel.cu" + "gpu/rnn_grad_kernel.cu.cc" + "gpu/rnn_kernel.cu.cc" + "gpu/slogdeterminant_grad_kernel.cu" + "gpu/softmax_grad_kernel.cu" + "gpu/softmax_kernel.cu" + "gpu/solve_grad_kernel.cu" + "gpu/solve_kernel.cu" + "gpu/spectral_norm_grad_kernel.cu" + "gpu/spectral_norm_kernel.cu" + "gpu/stft_kernel.cu" + "gpu/svd_grad_kernel.cu" + "gpu/top_k_grad_kernel.cu" + "gpu/top_k_kernel.cu" + "gpu/truncated_gaussian_random_kernel.cu" + "gpudnn/affine_grid_grad_kernel.cu" + "gpudnn/affine_grid_kernel.cu" + "gpudnn/softmax_grad_kernel.cu" + "gpudnn/softmax_kernel.cu" + "gpudnn/conv_grad_kernel.cu" + "gpudnn/conv_kernel.cu" + "gpudnn/conv_transpose_grad_kernel.cu" + "gpudnn/conv_transpose_kernel.cu" + "gpudnn/pool_grad_kernel.cu" + "gpudnn/pool_kernel.cu" + "sparse/gpu/softmax_grad_kernel.cu" + "sparse/gpu/softmax_kernel.cu" + "sparse/gpu/conv_kernel.cu" + "sparse/gpu/pool_kernel.cu" + "strings/gpu/strings_copy_kernel.cu" + "strings/gpu/strings_lower_upper_kernel.cu") +endif() + if(APPLE OR WIN32) list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu") endif() @@ -117,7 +196,9 @@ file( "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc" "sparse/xpu/*.cc") -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) collect_srcs(kernels_srcs SRCS ${kernel_cu}) kernel_declare("${kernel_cu}") endif() diff --git a/paddle/phi/kernels/activation_kernel.cc b/paddle/phi/kernels/activation_kernel.cc index f157c5e054bfb..9dffd348ec62b 100644 --- a/paddle/phi/kernels/activation_kernel.cc +++ b/paddle/phi/kernels/activation_kernel.cc @@ -32,7 +32,8 @@ using complex128 = ::phi::dtype::complex; PD_REGISTER_KERNEL(relu6, CPU, ALL_LAYOUT, phi::Relu6Kernel, float, double) {} 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(relu6, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index db30ec7389619..73fc6b4100cb4 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -135,7 +135,8 @@ PD_REGISTER_KERNEL(assign_value, int8_t, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 87eca2613a7b5..3817e62791c47 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -23,6 +23,9 @@ #ifdef PADDLE_WITH_HIP #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#endif namespace phi { @@ -32,6 +35,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventCreate(&start_); hipEventCreate(&stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventCreate(&start_); + musaEventCreate(&stop_); #else cudaEventCreate(&start_); cudaEventCreate(&stop_); @@ -46,6 +52,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventDestroy(start_); hipEventDestroy(stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventDestroy(start_); + musaEventDestroy(stop_); #else cudaEventDestroy(start_); cudaEventDestroy(stop_); @@ -55,6 +64,8 @@ class GpuTimer { void Start(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(start_, stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(start_, stream); #else cudaEventRecord(start_, stream); #endif @@ -63,6 +74,8 @@ class GpuTimer { void Stop(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(stop_, stream); +#elif defined(PADDLE_WITH_MUSA) + musaEventRecord(stop_, stream); #else cudaEventRecord(stop_, stream); #endif @@ -73,6 +86,9 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventSynchronize(stop_); hipEventElapsedTime(&milliseconds, start_, stop_); +#elif defined(PADDLE_WITH_MUSA) + musaEventSynchronize(stop_); + musaEventElapsedTime(&milliseconds, start_, stop_); #else cudaEventSynchronize(stop_); cudaEventElapsedTime(&milliseconds, start_, stop_); diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc index 6e496a355302f..661b287071fc5 100644 --- a/paddle/phi/kernels/check_memory_continue_kernel.cc +++ b/paddle/phi/kernels/check_memory_continue_kernel.cc @@ -88,7 +88,8 @@ PD_REGISTER_KERNEL(check_memory_continue, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(check_memory_continue, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 8dcd3a1d995d8..58cacd21bba18 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -292,7 +292,7 @@ PD_REGISTER_KERNEL(coalesce_tensor, } #endif -#ifdef PADDLE_WITH_HIP +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(coalesce_tensor, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc index 65ee3c1851003..81ed7170d7a24 100644 --- a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc 
+++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -64,7 +64,7 @@ struct GeluGradFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) auto x_data = x.data(); auto dx_data = dx.data(); auto dout_data = dout.data(); diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc index dbab3bd326664..47ab1a7839066 100644 --- a/paddle/phi/kernels/cpu/gelu_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -53,7 +53,7 @@ struct GeluFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) auto x_data = x.data(); auto out_data = out.data(); int n = std::min(x.size(), out.size()); diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index 17c24fa905b5c..638efeb4e3257 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -97,7 +97,8 @@ void DistGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL( dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} #endif diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 8df5e9a543eb2..3bc8ad34ac951 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -74,7 +74,8 @@ PD_REGISTER_KERNEL(empty_like, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 42d137ba4f419..476cfc810acf8 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -46,7 +46,8 @@ PD_REGISTER_KERNEL(flatten_grad, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index dc61e6a650efa..0c6c9b3ec2d9a 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -75,7 +75,8 @@ PD_REGISTER_KERNEL(flatten, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(flatten_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc index 38beafbfa51b9..8817d577f7c8d 100644 --- a/paddle/phi/kernels/full_kernel.cc +++ b/paddle/phi/kernels/full_kernel.cc @@ -49,7 +49,8 @@ PD_REGISTER_KERNEL(full_batch_size_like, bool) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + 
defined(PADDLE_WITH_MUSA) PD_REGISTER_KERNEL(full_batch_size_like, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 999625cf3dfb4..3a2b1f276bbbb 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -8,11 +8,20 @@ file( GLOB func_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -if(WITH_GPU OR WITH_ROCM) +if(WITH_GPU + OR WITH_ROCM + OR WITH_MUSA) file( GLOB func_cu_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") endif() +# TODO(@MTAI): compilation error will occur when compiling the following files. +# Compiler mcc need fix this bug. +if(WITH_MUSA) + list(REMOVE_ITEM func_cu_srcs "cross_entropy.cu" "gru_compute.cu" + "softmax.cu") +endif() + collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs}) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 203f6837d4611..a43300056161b 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -2566,7 +2566,8 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__) template struct CudaLogitFunctor : public BaseActivationFunctor { diff --git a/paddle/phi/kernels/funcs/algorithm.h b/paddle/phi/kernels/funcs/algorithm.h index 5f66f6f1abd4d..49daa32412674 100644 --- a/paddle/phi/kernels/funcs/algorithm.h +++ b/paddle/phi/kernels/funcs/algorithm.h @@ -40,7 +40,8 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { template HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group LowerBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || \ + defined(__MUSACC__) // @{ Group LowerBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/lower_bound auto *first = x; @@ -63,7 +64,8 @@ HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { template HOSTDEVICE inline size_t UpperBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group UpperBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || \ + defined(__MUSACC__) // @{ Group UpperBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/upper_bound auto *first = x; diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 140eca890480f..b1e492d65b4a1 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -175,7 +175,8 @@ class Blas { T* c, const int* ldc) const; -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) template void MatMulWithHead(const phi::DenseTensor& mat_a, const MatDescriptor& dim_a, @@ -303,7 +304,7 @@ class Blas { int batchCount) const; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) template void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, @@ -360,7 +361,8 @@ class Blas { T* B, int ldb) const; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const; @@ -445,7 +447,8 @@ class BlasT : private Blas { Base()->template CSRMM(args...); } -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) template void MatMulWithHead(ARGS... args) const { Base()->template MatMulWithHead(args...); @@ -543,7 +546,8 @@ class BlasT : private Blas { Base()->template TRSM(args...); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) template void BatchedGETRF(ARGS... args) const { Base()->template BatchedGETRF(args...); @@ -593,3 +597,6 @@ inline BlasT GetBlas(const DeviceContext& dev_ctx) { #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h" #endif +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/kernels/funcs/blas/blas_impl.mu.h" +#endif diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index ffafe15b8fcf2..f570a48eeb5b7 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1452,7 +1452,8 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead + !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: BatchedGEMMWithHead template <> template void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, @@ -1698,7 +1699,7 @@ void Blas::MatMul(const T *mat_a, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) + !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: MatMulWithHead /* * Multiple two matrixes with multiple heads diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.mu.h b/paddle/phi/kernels/funcs/blas/blas_impl.mu.h new file mode 100644 index 0000000000000..44098d9d090c6 --- /dev/null +++ b/paddle/phi/kernels/funcs/blas/blas_impl.mu.h @@ -0,0 +1,357 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
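+
+// NOTE: Every Blas<phi::GPUContext> specialization in this header is currently
+// an empty stub ({}). The declarations appear to mirror the CUDA/HIP Blas
+// interfaces so that MUSA builds compile and link, but no MUSA BLAS calls are
+// made yet, so code paths that reach these functions perform no computation.
+// (Presumably they will be backed by the MUSA BLAS library in a follow-up
+// change.)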
+ +#pragma once + +#if defined(__MUSACC__) +#include +#endif +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/flags.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +PHI_DECLARE_bool(enable_cublas_tensor_op_math); +PHI_DECLARE_bool(gemm_use_half_precision_compute_type); + +namespace phi { +namespace funcs { + +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T *A, + const T *B, + T beta, + T *C) const {} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::float16 alpha, + const phi::dtype::float16 *A, + const phi::dtype::float16 *B, + phi::dtype::float16 beta, + phi::dtype::float16 *C) const {} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C) const {} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::complex alpha, + const phi::dtype::complex *A, + const phi::dtype::complex *B, + phi::dtype::complex beta, + phi::dtype::complex *C) const {} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::complex alpha, + const phi::dtype::complex *A, + const phi::dtype::complex *B, + phi::dtype::complex beta, + phi::dtype::complex *C) const {} + +template <> +template +void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + T alpha, + const T *A, + int lda, + const T *B, + int ldb, + T beta, + T *C, + int ldc) const {} + +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::float16 alpha, + const phi::dtype::float16 *A, + int lda, + const phi::dtype::float16 *B, + int ldb, + phi::dtype::float16 beta, + phi::dtype::float16 *C, + int ldc) const {} + +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const {} + +template <> +template +void Blas::AXPY(int n, T alpha, const T *x, T *y) const {} + +template <> +template +void Blas::SCAL(int n, const T alpha, T *x) const {} + +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const {} + +template <> +template +void Blas::GEMV(bool trans_a, + int M, + int N, + T alpha, + const T *A, + const T *B, + T beta, + T *C) const {} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, + int M, + int N, + phi::dtype::float16 alpha, + const phi::dtype::float16 *A, + const phi::dtype::float16 *B, + phi::dtype::float16 beta, + phi::dtype::float16 *C) const {} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, + int M, + int N, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C) const {} + +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T *A, + const T 
*B, + T beta, + T *C, + int batchCount, + int64_t strideA, + int64_t strideB) const {} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int batchCount, + int64_t strideA, + int64_t strideB) const {} + +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + T alpha, + const T **A, + const T **B, + T beta, + T **C, + int batchCount) const {} + +#if defined(__MUSACC__) +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + double alpha, + const double **A, + const double **B, + double beta, + double **C, + int batchCount) const {} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const float **A, + const float **B, + float beta, + float **C, + int batchCount) const {} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::float16 alpha, + const phi::dtype::float16 **A, + const phi::dtype::float16 **B, + phi::dtype::float16 beta, + phi::dtype::float16 **C, + int batchCount) const {} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 **A, + const phi::dtype::bfloat16 **B, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 **C, + int batchCount) const {} +#endif + +template <> +template +void Blas::TRSM(CBLAS_SIDE side, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, + int M, + int N, + T alpha, + const T *A, + int lda, + T *B, + int ldb) const {} + +template <> +template +void Blas::BatchedGETRF( + int n, T **a, int *ipiv, int *info, int batch_size) const {} + +template <> +template +void Blas::BatchedGETRI(int n, + const T **a, + const int *ipiv, + T **a_inv, + int *info, + int batch_size) const {} + +template <> +template +void Blas::BatchedMatInv( + int n, const T **a, T **a_inv, int *info, int batch_size) const {} + +template <> +template +void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, + int n, + int nrhs, + const T **a, + int lda, + int *ipiv, + T **b, + int ldb, + int *info, + int batch_size) const {} + +template <> +template +void Blas::BatchedTRSM(CBLAS_SIDE side, + CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, + int M, + int N, + T alpha, + const T **A, + int lda, + T **B, + int ldb, + int batch_size) const {} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index e754ce3bf49e4..b1732b44373c7 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -17,7 +17,8 @@ limitations under the License. 
*/ #include #include "paddle/phi/kernels/funcs/elementwise_base.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__) #include "paddle/phi/kernels/funcs/dims_simplifier.h" namespace kps = phi::kps; @@ -27,7 +28,8 @@ namespace kps = phi::kps; namespace phi { namespace funcs { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__) enum BroadcastLoadType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; diff --git a/paddle/phi/kernels/funcs/check_numerics_utils.h b/paddle/phi/kernels/funcs/check_numerics_utils.h index 473d7994058a8..7f618fa3b3f33 100644 --- a/paddle/phi/kernels/funcs/check_numerics_utils.h +++ b/paddle/phi/kernels/funcs/check_numerics_utils.h @@ -86,7 +86,7 @@ HOSTDEVICE static void PrintAndThrowError(const char* debug_info, int64_t num_nan, int64_t num_inf, int64_t num_zero) { -#if !defined(__HIPCC__) && !defined(__CUDA_ARCH__) +#if !defined(__HIPCC__) && !defined(__CUDA_ARCH__) && !defined(__MUSA_ARCH__) PADDLE_THROW(phi::errors::PreconditionNotMet( "There are NAN or INF (num_nan=%lld, num_inf=%lld, num_zero=%lld) in " "%s.", diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 5a7574b56a891..3086d5dc4ed14 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -21,6 +21,10 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/kernels/funcs/segmented_array.h" +#ifdef PADDLE_WITH_MUSA +#include "paddle/phi/backends/gpu/musa/musa_helper.h" +#endif + namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/detail/strided_memcpy.h b/paddle/phi/kernels/funcs/detail/strided_memcpy.h index 0cd07fdfd0e1a..707b203e9f49b 100644 --- a/paddle/phi/kernels/funcs/detail/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/detail/strided_memcpy.h @@ -17,7 +17,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/device_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) #include "paddle/phi/backends/gpu/gpu_context.h" #endif @@ -41,7 +42,8 @@ struct StridedMemcpyFunctor { auto& cpu_place = place; memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy( @@ -68,7 +70,8 @@ struct StridedMemcpyFunctor { memory_utils::Copy( cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_MUSA) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy(gpu_place, diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index a30fb79f8c8b0..f0235f0baec5f 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -14,7 +14,7 @@ #pragma once -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include #include @@ -109,7 +109,7 @@ DenseTensor Diagonal(const DeviceContext& context, int64_t pos = std::abs(offset) * offset_stride; int64_t dim_size = ret_strides.size(); -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) thrust::device_vector diag_vec(vectorize(dig_stride)); const int64_t* diag_arr = thrust::raw_pointer_cast(diag_vec.data()); thrust::device_vector ret_vec(ret_strides); @@ -146,7 +146,7 @@ std::vector ComputeDimStride(const std::vector dim) { return dim_strides; } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template __global__ void DiagonalCuda(const T* data1, T* data2, diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index abade7ac0ef87..4705370f71f7c 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -17,6 +17,9 @@ limitations under the License. */ #ifdef __NVCC__ #include #endif +#ifdef __MUSACC__ +#include +#endif #ifdef __HIPCC__ #include #endif @@ -28,7 +31,7 @@ limitations under the License. */ #include "paddle/phi/core/generator.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif @@ -49,7 +52,7 @@ struct exponential_transform { explicit exponential_transform(T lambda) : lambda_(lambda) {} HOSTDEVICE inline T operator()(T val) const { -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) T log = -std::numeric_limits::epsilon() / 2; if (val < static_cast(1.) 
- std::numeric_limits::epsilon() / 2) { if (std::is_same::value) { @@ -113,7 +116,7 @@ struct normal_transform { T std_; }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) namespace kps = phi::kps; @@ -186,6 +189,69 @@ struct normal_distribution { static constexpr int kReturnsCount = 2; }; +#elif defined(__MUSACC__) +template +struct uniform_distribution { + __device__ inline T operator()(murand_state_philox4x32_10 *state) const { + return static_cast(murand_uniform(state)); + } + static constexpr int kReturnsCount = 1; +}; + +template <> +struct uniform_distribution { + __device__ inline float4 operator()(murand_state_philox4x32_10 *state) const { + return murand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + murand_state_philox4x32_10 *state) const { + return murand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct uniform_distribution { + __device__ inline uint4 operator()(murand_state_philox4x32_10 *state) const { + return murand4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline ulonglong2 operator()( + murand_state_philox4x32_10 *state) const { + ulonglong2 result; + uint4 rand = murand4(state); + result.x = (uint64_t)rand.x << 32 | rand.y; + result.y = (uint64_t)rand.z << 32 | rand.w; + return result; + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()(murand_state_philox4x32_10 *state) const { + return murand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + murand_state_philox4x32_10 *state) const { + return murand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; #else template struct uniform_distribution { @@ -268,6 +334,10 @@ __global__ void DistributionKernel(size_t size, curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, offset, &state); using SType = curandStatePhilox4_32_10_t; +#elif defined(__MUSACC__) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = murand_state_philox4x32_10; #else hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, offset, &state); diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index a1fc2c225ecf2..d31ab7f3c1c12 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -24,6 +24,10 @@ limitations under the License. 
*/ #include #include #endif +#ifdef PADDLE_WITH_MUSA +#include +#include +#endif #include "paddle/phi/kernels/funcs/dropout_impl_util.h" @@ -142,6 +146,10 @@ __global__ void VectorizedRandomGenerator(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murand_state_philox4x32_10; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -212,6 +220,10 @@ __global__ void VectorizedGeneratorMask(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; +#elif defined(PADDLE_WITH_MUSA) + murand_state_philox4x32_10 state; + murand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = murand_state_philox4x32_10; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -295,6 +307,11 @@ void DropoutFwGPUKernelDriver( hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); +#elif defined(PADDLE_WITH_MUSA) + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + musaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); @@ -430,6 +447,8 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, if (upscale_in_train && dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP hipMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); +#elif defined(PADDLE_WITH_MUSA) + musaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #else cudaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #endif diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 274ac1cc32c05..08d59cc2569d4 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -22,7 +22,8 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__) #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/function_traits.h" @@ -151,7 +152,7 @@ class MidWiseTransformIterator int64_t post_; }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) template class RowwiseTransformIterator : public thrust::iterator_adaptor, @@ -486,7 +487,8 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, } } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || \ + defined(__xpu__) // static unroller template