diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h
index e03235e116f25..55ee5d96d58c5 100644
--- a/paddle/fluid/platform/dynload/nvrtc.h
+++ b/paddle/fluid/platform/dynload/nvrtc.h
@@ -39,6 +39,8 @@ extern bool HasNVRTC();
   __macro(nvrtcCompileProgram);  \
   __macro(nvrtcCreateProgram);   \
   __macro(nvrtcDestroyProgram);  \
+  __macro(nvrtcGetCUBIN);        \
+  __macro(nvrtcGetCUBINSize);    \
   __macro(nvrtcGetPTX);          \
   __macro(nvrtcGetPTXSize);      \
   __macro(nvrtcGetProgramLog);   \
diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc
index d160b5034f998..9d657cd1cd077 100644
--- a/paddle/phi/backends/device_code.cc
+++ b/paddle/phi/backends/device_code.cc
@@ -335,7 +335,7 @@ bool GPUDeviceCode::Compile(bool include_path) {
       DeviceContextPool::Instance().Get(place_));
   int compute_capability = dev_ctx->GetComputeCapability();
   std::string compute_flag =
-      "--gpu-architecture=compute_" + std::to_string(compute_capability);
+      "--gpu-architecture=sm_" + std::to_string(compute_capability);
   std::vector<const char*> options = {"--std=c++11", compute_flag.c_str()};
   std::string include_option;
   if (include_path) {
@@ -369,15 +369,15 @@ bool GPUDeviceCode::Compile(bool include_path) {
     return false;
   }
 
-  // Obtain PTX from the program
-  size_t ptx_size;
-  if (!CheckNVRTCResult(dynload::nvrtcGetPTXSize(program, &ptx_size),
-                        "nvrtcGetPTXSize")) {
+  // Obtain cubin from the program
+  size_t cubin_size;
+  if (!CheckNVRTCResult(dynload::nvrtcGetCUBINSize(program, &cubin_size),
+                        "nvrtcGetCUBINSize")) {
     return false;
   }
-  ptx_.resize(ptx_size + 1);
-  if (!CheckNVRTCResult(dynload::nvrtcGetPTX(program, ptx_.data()),
-                        "nvrtcGetPTX")) {
+  cubin_.resize(cubin_size + 1);
+  if (!CheckNVRTCResult(dynload::nvrtcGetCUBIN(program, cubin_.data()),
+                        "nvrtcGetCUBIN")) {
     return false;
   }
 
@@ -386,7 +386,7 @@ bool GPUDeviceCode::Compile(bool include_path) {
     return false;
   }
 
-  if (!CheckCUDADriverResult(dynload::cuModuleLoadData(&module_, ptx_.data()),
+  if (!CheckCUDADriverResult(dynload::cuModuleLoadData(&module_, cubin_.data()),
                              "cuModuleLoadData",
                              name_)) {
     return false;
diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h
index 8debb4dc9c45e..910e1d21073f8 100644
--- a/paddle/phi/backends/device_code.h
+++ b/paddle/phi/backends/device_code.h
@@ -78,6 +78,7 @@ class GPUDeviceCode : public DeviceCode {
   int max_threads_{0};
   int num_threads_{1024};
   int workload_per_thread_{1};
+  std::vector<char> cubin_;
   std::vector<char> ptx_;
 #ifdef PADDLE_WITH_HIP
   hipModule_t module_;
diff --git a/paddle/phi/backends/dynload/nvrtc.h b/paddle/phi/backends/dynload/nvrtc.h
index 9244e9487b250..b602c19ddd059 100644
--- a/paddle/phi/backends/dynload/nvrtc.h
+++ b/paddle/phi/backends/dynload/nvrtc.h
@@ -51,6 +51,8 @@ extern bool HasNVRTC();
   __macro(nvrtcCompileProgram);  \
   __macro(nvrtcCreateProgram);   \
   __macro(nvrtcDestroyProgram);  \
+  __macro(nvrtcGetCUBIN);        \
+  __macro(nvrtcGetCUBINSize);    \
   __macro(nvrtcGetPTX);          \
   __macro(nvrtcGetPTXSize);      \
   __macro(nvrtcGetProgramLog);   \
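
For reference, below is a minimal standalone sketch (not part of the patch) of the cubin flow this change adopts: compile with a real architecture (`sm_XX`), fetch the binary with `nvrtcGetCUBINSize`/`nvrtcGetCUBIN`, and load it via `cuModuleLoadData`. It assumes CUDA 11.1+, where the cubin getters exist; the `scale` kernel and names are made up for illustration, and error handling is reduced to a single compile-log dump, whereas Paddle routes all of these calls through its dynload wrappers and the CheckNVRTCResult/CheckCUDADriverResult helpers.

```cpp
// Standalone illustration of NVRTC cubin generation + driver-API module load.
// Assumes CUDA 11.1+; the kernel source and names here are hypothetical.
#include <cuda.h>
#include <nvrtc.h>

#include <cstdio>
#include <string>
#include <vector>

static const char* kKernelSrc = R"(
extern "C" __global__ void scale(float* x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
})";

int main() {
  // Driver API setup.
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);
  CUcontext ctx;
  cuCtxCreate(&ctx, 0, dev);

  int major = 0, minor = 0;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);

  // Cubin generation needs a *real* architecture (sm_XX); a virtual one
  // (compute_XX) would only yield PTX, hence the flag change in the patch.
  std::string arch = "--gpu-architecture=sm_" + std::to_string(major * 10 + minor);
  const char* options[] = {"--std=c++11", arch.c_str()};

  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, kKernelSrc, "scale.cu", 0, nullptr, nullptr);
  if (nvrtcCompileProgram(prog, 2, options) != NVRTC_SUCCESS) {
    size_t log_size = 0;
    nvrtcGetProgramLogSize(prog, &log_size);
    std::vector<char> log(log_size + 1);
    nvrtcGetProgramLog(prog, log.data());
    std::fprintf(stderr, "NVRTC compile failed:\n%s\n", log.data());
    return 1;
  }

  // Retrieve the cubin (instead of PTX) from the compiled program.
  size_t cubin_size = 0;
  nvrtcGetCUBINSize(prog, &cubin_size);
  std::vector<char> cubin(cubin_size);
  nvrtcGetCUBIN(prog, cubin.data());
  nvrtcDestroyProgram(&prog);

  // cuModuleLoadData accepts the cubin image directly, so the driver does not
  // have to JIT-compile PTX at module load time.
  CUmodule module;
  cuModuleLoadData(&module, cubin.data());
  CUfunction kernel;
  cuModuleGetFunction(&kernel, module, "scale");
  std::printf("loaded %zu-byte cubin, kernel handle %p\n", cubin_size,
              reinterpret_cast<void*>(kernel));

  cuModuleUnload(module);
  cuCtxDestroy(ctx);
  return 0;
}
```

The flag change and the getter change go together: NVRTC only produces a cubin when `--gpu-architecture` names a real architecture, which is why the patch switches `compute_` to `sm_` in the same place it swaps `nvrtcGetPTX*` for `nvrtcGetCUBIN*`.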