Skip to content

Commit

Permalink
[MTAI-484] feat(build): support operators in fluid for MUSA (PaddlePa…
Browse files Browse the repository at this point in the history
…ddle#55)

* [MTAI-484] feat(build): support fluid for MUSA

* [MTAI-484] doc(README.md): add a static graph python demo for MUSA

* [MTAI-484] fix(build): fix a macro bug in patch files

* [MTAI-484] fix(build): fix test in fluid
  • Loading branch information
caizhi-mt authored and mt-robot committed Aug 24, 2023
1 parent cd3c151 commit fc3d741
Show file tree
Hide file tree
Showing 39 changed files with 491 additions and 84 deletions.
46 changes: 44 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
- [Docker Image for User](#docker-image-for-user)
- [Useful Environment Variables](#useful-environment-variables)
- [Getting Started](#getting-started)
- [Demo](#demo)
- [Demo1](#demo1)
- [Demo2](#demo2)

<!-- tocstop -->

Expand Down Expand Up @@ -94,7 +95,7 @@ docker run -it --privileged --name=paddle_musa_dev --env MTHREADS_VISIBLE_DEVICE
| export MUSA_LAUNCH_BLOCKING=1 | Driver | Set the synchronization mode |

## Getting Started
### Demo
### Demo1

<details>
<summary>code</summary>
Expand All @@ -114,3 +115,44 @@ gpu_result = gpu_tensor1 + gpu_tensor2
print("gpu_result: ", gpu_result)
```
</details>

### Demo2

<details>
<summary>code</summary>

```python
import numpy as np
import paddle
paddle.enable_static()

# Creates a variable with fixed size [3, 2, 1]
# User can only feed data of the same shape to x
# the dtype is not set, so it will set "float32" by
# paddle.get_default_dtype(). You can use paddle.get_default_dtype() to
# change the global dtype
x = paddle.static.data(name='x', shape=[3, 2, 1])

# Creates a variable with changeable batch size -1.
# Users can feed data of any batch size into y,
# but size of each data sample has to be [2, 1]
y = paddle.static.data(name='y', shape=[-1, 2, 1], dtype='float32')

z = x + y

# In this example, we will feed x and y with np-ndarray "1"
# and fetch z, like implementing "1 + 1 = 2" in PaddlePaddle
feed_data = np.ones(shape=[3, 2, 1], dtype=np.float32)

exe = paddle.static.Executor(paddle.framework.CUDAPlace(0))
out = exe.run(paddle.static.default_main_program(),
feed={
'x': feed_data,
'y': feed_data
},
fetch_list=[z.name])

# np-ndarray of shape=[3, 2, 1], dtype=float32, whose elements are 2
print(out)
```
</details>
3 changes: 3 additions & 0 deletions cmake/musa.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,16 @@ list(APPEND MUSA_MCC_FLAGS -Wno-deprecated-copy-with-user-provided-copy)
list(APPEND MUSA_MCC_FLAGS -Wno-pragma-once-outside-header)
list(APPEND MUSA_MCC_FLAGS -Wno-return-type)
list(APPEND MUSA_MCC_FLAGS -Wno-sign-compare)
list(APPEND MUSA_MCC_FLAGS -Wno-overloaded-virtual)
list(APPEND MUSA_MCC_FLAGS -Wno-mismatched-tags)
list(APPEND MUSA_MCC_FLAGS -Wno-pessimizing-move)
list(APPEND MUSA_MCC_FLAGS -Wno-unused-but-set-variable)
list(APPEND MUSA_MCC_FLAGS -Wno-bitwise-instead-of-logical)
list(APPEND MUSA_MCC_FLAGS -Wno-format)
list(APPEND MUSA_MCC_FLAGS -Wno-self-assign)
list(APPEND MUSA_MCC_FLAGS -Wno-literal-conversion)
list(APPEND MUSA_MCC_FLAGS -Wno-literal-range)
list(APPEND MUSA_MCC_FLAGS -Wno-unused-private-field)
list(APPEND MUSA_MCC_FLAGS -Wno-unknown-warning-option)
list(APPEND MUSA_MCC_FLAGS -Wno-unused-variable)
list(APPEND MUSA_MCC_FLAGS -Wno-unused-value)
Expand Down
46 changes: 45 additions & 1 deletion cmake/operators.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ function(register_cu_kernel TARGET)
${TARGET}
SRCS ${cu_srcs}
DEPS ${op_library_DEPS} ${op_common_deps})
elseif(WITH_MUSA)
musa_library(${TARGET} SRCS ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif(WITH_ROCM)
hip_library(
${TARGET}
Expand Down Expand Up @@ -208,6 +211,28 @@ function(op_library TARGET)
list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu)
endif()
endif()
if(WITH_MUSA)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
# rename in KP: .kps -> .cu
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps
${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES
${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES}
PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
endif()
if(WITH_ROCM)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND hip_cc_srcs ${TARGET}.cu.cc)
Expand Down Expand Up @@ -275,6 +300,10 @@ function(op_library TARGET)
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(WITH_MUSA AND ${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(WITH_MUSA AND ${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
Expand All @@ -285,7 +314,11 @@ function(op_library TARGET)
list(APPEND xpu_kp_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
elseif((WITH_ROCM OR WITH_GPU) AND ${src} MATCHES ".*\\.kps$")
elseif(
(WITH_ROCM
OR WITH_GPU
OR WITH_MUSA)
AND ${src} MATCHES ".*\\.kps$")
string(REPLACE ".kps" ".cu" src_cu ${src})
file(COPY ${src} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${src}
Expand Down Expand Up @@ -391,6 +424,17 @@ function(op_library TARGET)
SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs}
${mkldnn_cc_srcs} ${hip_srcs}
DEPS ${op_library_DEPS} ${op_common_deps})
elseif(WITH_MUSA)
musa_library(
${TARGET}
SRCS
${cc_srcs}
${cu_cc_srcs}
${cu_srcs}
${mkldnn_cc_srcs}
DEPS
${op_library_DEPS}
${op_common_deps})
elseif(WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
xpu_library(
${TARGET}
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/eager/auto_code_generator/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ if(WITH_ROCM)
target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB})
endif()

if(WITH_MUSA)
target_link_libraries(eager_generator ${MUSARTC_LIB})
endif()

if(WITH_CINN)
target_link_libraries(eager_generator ${PYTHON_LIBRARIES})
endif()
Expand Down
18 changes: 18 additions & 0 deletions paddle/fluid/framework/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,16 @@ elseif(WITH_ROCM)
tensor_util_test
SRCS tensor_util_test.cc tensor_util_test.cu
DEPS tensor dlpack_tensor isfinite_op)
elseif(WITH_MUSA)
musa_test(
tensor_util_test
SRCS
tensor_util_test.cc
tensor_util_test.cu
DEPS
tensor
dlpack_tensor
isfinite_op)
else()
cc_test(
tensor_util_test
Expand Down Expand Up @@ -183,6 +193,8 @@ elseif(WITH_ROCM)
lod_tensor_gpu_test
SRCS lod_tensor_test.cu
DEPS lod_tensor)
elseif(WITH_MUSA)
musa_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
endif()

cc_library(
Expand Down Expand Up @@ -312,6 +324,10 @@ elseif(WITH_ROCM)
data_type_transform_test
SRCS data_type_transform_test.cc data_type_transform_test.cu
DEPS data_type_transform)
elseif(WITH_MUSA)
musa_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
musa_test(data_type_transform_test SRCS data_type_transform_test.cc
data_type_transform_test.cu DEPS data_type_transform)
elseif(WITH_XPU)
cc_library(
data_type_transform
Expand Down Expand Up @@ -566,6 +582,8 @@ elseif(WITH_ROCM)
op_registry_test
SRCS op_registry_test.cc
DEPS op_registry)
elseif(WITH_MUSA)
musa_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
endif()

if(WITH_PYTHON)
Expand Down
4 changes: 3 additions & 1 deletion paddle/fluid/framework/data_feed.cc
Original file line number Diff line number Diff line change
Expand Up @@ -269,12 +269,14 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
#elif defined(PADDLE_WITH_HIP)
hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
#elif defined(PADDLE_WITH_MUSA)
musaMemcpy(dst, src, size, musaMemcpyHostToDevice);
#elif defined(PADDLE_WITH_XPU_KP)
xpu_memcpy(dst, src, size, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"Not supported GPU/ROCM, please compile with option WITH_GPU=ON or "
"WITH_ROCM=ON."));
"WITH_ROCM=ON or WITH_MUSA=ON."));
#endif
}
}
Expand Down
11 changes: 11 additions & 0 deletions paddle/fluid/framework/fleet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,17 @@ if(WITH_BOX_PS)
SRCS box_wrapper.cc box_wrapper.cu
DEPS framework_proto lod_tensor box_ps)
endif()
if(WITH_MUSA)
musa_library(
box_wrapper
SRCS
box_wrapper.cc
box_wrapper.cu
DEPS
framework_proto
lod_tensor
box_ps)
endif()
else()
cc_library(
box_wrapper
Expand Down
15 changes: 15 additions & 0 deletions paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,18 @@ if(WITH_ROCM)
SRCS heter_ps.cu
DEPS heter_comm)
endif()
if(WITH_MUSA)
musa_library(
heter_comm
SRCS
heter_comm.h
feature_value.h
heter_resource.cc
heter_resource.h
hashtable.h
DEPS
cub
device_context)
musa_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
musa_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
endif()
8 changes: 6 additions & 2 deletions paddle/fluid/framework/ir/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,9 @@ cc_test(
SRCS relu6_fuse_pass_test.cc
DEPS relu6_fuse_pass)

if(WITH_GPU OR WITH_ROCM)
if(WITH_GPU
OR WITH_ROCM
OR WITH_MUSA)
cc_test(
test_embedding_eltwise_layernorm_fuse_pass
SRCS embedding_eltwise_layernorm_fuse_pass_tester.cc
Expand Down Expand Up @@ -501,7 +503,9 @@ if(WITH_MKLDNN)
naive_executor
device_context
phi)
if(WITH_GPU OR WITH_ROCM)
if(WITH_GPU
OR WITH_ROCM
OR WITH_MUSA)
set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv)
endif()
cc_test(
Expand Down
4 changes: 3 additions & 1 deletion paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ cc_library(
code_generator
SRCS operation.cc code_generator.cc code_generator_helper.cc
DEPS graph subgraph_detector)
if(WITH_GPU OR WITH_ROCM)
if(WITH_GPU
OR WITH_ROCM
OR WITH_MUSA)
cc_test(
test_code_generator
SRCS code_generator_tester.cc
Expand Down
6 changes: 6 additions & 0 deletions paddle/fluid/framework/lod_tensor_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ TEST(LoD, data) {
mix_vector_v.CUDAMutableData(gpu),
v.size());
hipDeviceSynchronize();
#elif defined(PADDLE_WITH_MUSA)
test<<<1, 1>>>(mix_vector_v.CUDAMutableData(gpu), v.size());
musaDeviceSynchronize();
#else
test<<<1, 1>>>(mix_vector_v.CUDAMutableData(gpu), v.size());
cudaDeviceSynchronize();
Expand Down Expand Up @@ -80,6 +83,9 @@ TEST(DenseTensor, LoDInGPU) {
mix_vector.CUDAMutableData(place),
lod[0].size());
hipDeviceSynchronize();
#elif defined(PADDLE_WITH_MUSA)
test<<<1, 8>>>(mix_vector.CUDAMutableData(place), lod[0].size());
musaDeviceSynchronize();
#else
test<<<1, 8>>>(mix_vector.CUDAMutableData(place), lod[0].size());
cudaDeviceSynchronize();
Expand Down
3 changes: 2 additions & 1 deletion paddle/fluid/framework/new_executor/interpreter_base_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ class InterpreterBaseImpl {
inline void SetDeviceId(const platform::Place& place) {
// TODO(zhiqiu): reduce the cost
if (platform::is_gpu_place(place)) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && \
!defined(PADDLE_WITH_MUSA)
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with CUDA support.",
Expand Down
3 changes: 2 additions & 1 deletion paddle/fluid/framework/operator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2050,7 +2050,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
/*For profiling/benchmark only*/
if (FLAGS_benchmark) {
dev_ctx->Wait();
#if defined(PADDLE_WITH_CUDA) || defined(PADLDE_WITH_ROCM)
#if defined(PADDLE_WITH_CUDA) || defined(PADLDE_WITH_ROCM) || \
defined(PADLDE_WITH_MUSA)
PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
#endif
VLOG(4) << "Operator(" << Type() << "): context wait and get last error";
Expand Down
12 changes: 12 additions & 0 deletions paddle/fluid/memory/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ if(WITH_ROCM)
DEPS malloc gpu_info place)
endif()

if(WITH_MUSA)
musa_test(malloc_test SRCS malloc_test.cu DEPS device_context malloc)
musa_test(
cuda_managed_memory_test
SRCS
cuda_managed_memory_test.cu
DEPS
malloc
gpu_info
place)
endif()

if(WITH_TESTING AND TEST cuda_managed_memory_test)
set_tests_properties(
cuda_managed_memory_test
Expand Down
Loading

0 comments on commit fc3d741

Please sign in to comment.