diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h
index e03235e116f25..55ee5d96d58c5 100644
--- a/paddle/fluid/platform/dynload/nvrtc.h
+++ b/paddle/fluid/platform/dynload/nvrtc.h
@@ -39,6 +39,8 @@ extern bool HasNVRTC();
   __macro(nvrtcCompileProgram);  \
   __macro(nvrtcCreateProgram);   \
   __macro(nvrtcDestroyProgram);  \
+  __macro(nvrtcGetCUBIN);        \
+  __macro(nvrtcGetCUBINSize);    \
   __macro(nvrtcGetPTX);          \
   __macro(nvrtcGetPTXSize);      \
   __macro(nvrtcGetProgramLog);   \
diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc
index d160b5034f998..9d657cd1cd077 100644
--- a/paddle/phi/backends/device_code.cc
+++ b/paddle/phi/backends/device_code.cc
@@ -335,7 +335,7 @@ bool GPUDeviceCode::Compile(bool include_path) {
       DeviceContextPool::Instance().Get(place_));
   int compute_capability = dev_ctx->GetComputeCapability();
   std::string compute_flag =
-      "--gpu-architecture=compute_" + std::to_string(compute_capability);
+      "--gpu-architecture=sm_" + std::to_string(compute_capability);
   std::vector<const char*> options = {"--std=c++11", compute_flag.c_str()};
   std::string include_option;
   if (include_path) {
@@ -369,15 +369,15 @@ bool GPUDeviceCode::Compile(bool include_path) {
     return false;
   }
 
-  // Obtain PTX from the program
-  size_t ptx_size;
-  if (!CheckNVRTCResult(dynload::nvrtcGetPTXSize(program, &ptx_size),
-                        "nvrtcGetPTXSize")) {
+  // Obtain cubin from the program
+  size_t cubin_size;
+  if (!CheckNVRTCResult(dynload::nvrtcGetCUBINSize(program, &cubin_size),
+                        "nvrtcGetCUBINSize")) {
     return false;
   }
-  ptx_.resize(ptx_size + 1);
-  if (!CheckNVRTCResult(dynload::nvrtcGetPTX(program, ptx_.data()),
-                        "nvrtcGetPTX")) {
+  cubin_.resize(cubin_size + 1);
+  if (!CheckNVRTCResult(dynload::nvrtcGetCUBIN(program, cubin_.data()),
+                        "nvrtcGetCUBIN")) {
     return false;
   }
 
@@ -386,7 +386,7 @@ bool GPUDeviceCode::Compile(bool include_path) {
     return false;
   }
 
-  if (!CheckCUDADriverResult(dynload::cuModuleLoadData(&module_, ptx_.data()),
+  if (!CheckCUDADriverResult(dynload::cuModuleLoadData(&module_, cubin_.data()),
                              "cuModuleLoadData",
                              name_)) {
     return false;
diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h
index 8debb4dc9c45e..910e1d21073f8 100644
--- a/paddle/phi/backends/device_code.h
+++ b/paddle/phi/backends/device_code.h
@@ -78,6 +78,7 @@ class GPUDeviceCode : public DeviceCode {
   int max_threads_{0};
   int num_threads_{1024};
   int workload_per_thread_{1};
+  std::vector<char> cubin_;
   std::vector<char> ptx_;
 #ifdef PADDLE_WITH_HIP
   hipModule_t module_;
diff --git a/paddle/phi/backends/dynload/nvrtc.h b/paddle/phi/backends/dynload/nvrtc.h
index 9244e9487b250..b602c19ddd059 100644
--- a/paddle/phi/backends/dynload/nvrtc.h
+++ b/paddle/phi/backends/dynload/nvrtc.h
@@ -51,6 +51,8 @@ extern bool HasNVRTC();
   __macro(nvrtcCompileProgram);  \
   __macro(nvrtcCreateProgram);   \
   __macro(nvrtcDestroyProgram);  \
+  __macro(nvrtcGetCUBIN);        \
+  __macro(nvrtcGetCUBINSize);    \
   __macro(nvrtcGetPTX);          \
   __macro(nvrtcGetPTXSize);      \
   __macro(nvrtcGetProgramLog);   \
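
For reference, below is a minimal standalone sketch (not part of the patch) of the cubin flow this change adopts: compile with a real architecture (`sm_XX`), fetch the binary with `nvrtcGetCUBINSize`/`nvrtcGetCUBIN`, and load it via `cuModuleLoadData`. It assumes CUDA 11.1+, where the cubin getters exist; the `scale` kernel and names are made up for illustration, and error handling is reduced to a single compile-log dump, whereas Paddle routes all of these calls through its dynload wrappers and the CheckNVRTCResult/CheckCUDADriverResult helpers.

```cpp
// Standalone illustration of NVRTC cubin generation + driver-API module load.
// Assumes CUDA 11.1+; the kernel source and names here are hypothetical.
#include <cuda.h>
#include <nvrtc.h>

#include <cstdio>
#include <string>
#include <vector>

static const char* kKernelSrc = R"(
extern "C" __global__ void scale(float* x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
})";

int main() {
  // Driver API setup.
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);
  CUcontext ctx;
  cuCtxCreate(&ctx, 0, dev);

  int major = 0, minor = 0;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);

  // Cubin generation needs a *real* architecture (sm_XX); a virtual one
  // (compute_XX) would only yield PTX, hence the flag change in the patch.
  std::string arch = "--gpu-architecture=sm_" + std::to_string(major * 10 + minor);
  const char* options[] = {"--std=c++11", arch.c_str()};

  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, kKernelSrc, "scale.cu", 0, nullptr, nullptr);
  if (nvrtcCompileProgram(prog, 2, options) != NVRTC_SUCCESS) {
    size_t log_size = 0;
    nvrtcGetProgramLogSize(prog, &log_size);
    std::vector<char> log(log_size + 1);
    nvrtcGetProgramLog(prog, log.data());
    std::fprintf(stderr, "NVRTC compile failed:\n%s\n", log.data());
    return 1;
  }

  // Retrieve the cubin (instead of PTX) from the compiled program.
  size_t cubin_size = 0;
  nvrtcGetCUBINSize(prog, &cubin_size);
  std::vector<char> cubin(cubin_size);
  nvrtcGetCUBIN(prog, cubin.data());
  nvrtcDestroyProgram(&prog);

  // cuModuleLoadData accepts the cubin image directly, so the driver does not
  // have to JIT-compile PTX at module load time.
  CUmodule module;
  cuModuleLoadData(&module, cubin.data());
  CUfunction kernel;
  cuModuleGetFunction(&kernel, module, "scale");
  std::printf("loaded %zu-byte cubin, kernel handle %p\n", cubin_size,
              reinterpret_cast<void*>(kernel));

  cuModuleUnload(module);
  cuCtxDestroy(ctx);
  return 0;
}
```

The flag change and the getter change go together: NVRTC only produces a cubin when `--gpu-architecture` names a real architecture, which is why the patch switches `compute_` to `sm_` in the same place it swaps `nvrtcGetPTX*` for `nvrtcGetCUBIN*`.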