diff --git a/cmake/external/cudnn-frontend.cmake b/cmake/external/cudnn-frontend.cmake
index 5ba038d457d54..16c21c8dbf26d 100644
--- a/cmake/external/cudnn-frontend.cmake
+++ b/cmake/external/cudnn-frontend.cmake
@@ -28,24 +28,24 @@ endif()
 if((NOT DEFINED CUDNN_FRONTEND_NAME) OR (NOT DEFINED CUDNN_FRONTEND_URL))
   set(CUDNN_FRONTEND_VER
-      "v0.7.1"
+      "v0.9.1"
       CACHE STRING "" FORCE)
   set(CUDNN_FRONTEND_NAME
       "cudnn-frontend"
       CACHE STRING "" FORCE)
   set(CUDNN_FRONTEND_URL
-      "https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v0.7.1.tar.gz"
+      "https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/${CUDNN_FRONTEND_VER}.tar.gz"
       CACHE STRING "" FORCE)
-  set(CUDNN_FRONTEND_CACHE_FILENAME "v0.7.1.tar.gz")
 endif()
-set(CUDNN_FRONTEND_URL_MD5 "d8f911df571f8b0d40226efa9c0150c8")
+set(CUDNN_FRONTEND_CACHE_FILENAME "${CUDNN_FRONTEND_VER}.tar.gz")
+set(CUDNN_FRONTEND_URL_MD5 "da7cbad1305427f687dd4fd737178f80")
 
 message(
   STATUS
     "CUDNN_FRONTEND_NAME: ${CUDNN_FRONTEND_NAME}, CUDNN_FRONTEND_URL: ${CUDNN_FRONTEND_URL}"
 )
-set(DIRENT_DOWNLOAD_DIR "${PADDLE_SOURCE_DIR}/third_party/cudnn-frontend")
-# Version: v0.7.1
+set(CUDNN_FRONTEND_DOWNLOAD_DIR
+    "${PADDLE_SOURCE_DIR}/third_party/cudnn-frontend")
 set(CUDNN_FRONTEND_PREFIX_DIR ${THIRD_PARTY_PATH}/cudnn-frontend)
 set(CUDNN_FRONTEND_SOURCE_DIR
     ${THIRD_PARTY_PATH}/cudnn-frontend/src/extern_cudnn_frontend/include)
@@ -55,7 +55,7 @@ include_directories(${CUDNN_FRONTEND_INCLUDE_DIR})
 
 message(
   STATUS
-    "Adding cudnn-frontend. Version: ${CUDNN_FRONTEND_VER}. Directory: ${DIRENT_DOWNLOAD_DIR}"
+    "Adding cudnn-frontend. Version: ${CUDNN_FRONTEND_VER}. Directory: ${CUDNN_FRONTEND_DOWNLOAD_DIR}"
 )
 
 function(download_cudnn_frontend)
@@ -99,9 +99,7 @@ ExternalProject_Add(
   DOWNLOAD_DIR ${CUDNN_FRONTEND_DOWNLOAD_DIR}
   DOWNLOAD_NO_PROGRESS 1
   UPDATE_COMMAND ""
-  PATCH_COMMAND
-    patch -d ${CUDNN_FRONTEND_SOURCE_DIR} -p2 <
-    ${PADDLE_SOURCE_DIR}/patches/cudnn-frontend/0001-patch-for-paddle.patch
+  PATCH_COMMAND ""
   CONFIGURE_COMMAND ""
   BUILD_COMMAND ""
   INSTALL_COMMAND ""
diff --git a/paddle/phi/kernels/autotune/cache_cudnn_frontend.h b/paddle/phi/kernels/autotune/cache_cudnn_frontend.h
index c65e69bdbef4f..095cedccb991c 100644
--- a/paddle/phi/kernels/autotune/cache_cudnn_frontend.h
+++ b/paddle/phi/kernels/autotune/cache_cudnn_frontend.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <mutex>
 #include <string>
+#include <thread>
 #include <vector>
 
 #include "paddle/phi/backends/dynload/cudnn_frontend.h"
@@ -34,7 +35,13 @@ class CudnnFrontendPlanCache {
     saturation_count_ = FLAGS_cudnn_cache_saturation_count;
   }
 
-  int64_t Size() const { return map_.size(); }
+  int64_t Size() const {
+    int64_t total_size = 0;
+    for (auto it = map_.begin(); it != map_.end(); it++) {
+      total_size += (it->second).size();
+    }
+    return total_size;
+  }
 
   int64_t CacheHits() const { return cache_hits_; }
 
@@ -58,11 +65,12 @@ class CudnnFrontendPlanCache {
     cache_misses_ = 0;
   }
 
-  bool FindPlan(const cudnn_frontend::OperationGraph& op_graph,
-                bool use_addto = false) {
+  bool FindPlan(const cudnn_frontend::feature_vector_t &feature,
+                cudnnHandle_t handle) {
     bool ret = false;
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    if (map_.count(MakeKey(op_graph, use_addto)) > 0) {
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+    if (local_map.count(GetExtendedFeature(feature, handle)) > 0) {
       cache_hits_++;
       ret = true;
     } else {
@@ -71,58 +79,98 @@ class CudnnFrontendPlanCache {
     return ret;
   }
 
-  cudnn_frontend::ManagedOpaqueDescriptor GetConfig(
-      const cudnn_frontend::OperationGraph& op_graph,
-      cudnnHandle_t handle,
-      bool use_addto = false) {
+  void GetPlan(const cudnn_frontend::feature_vector_t &feature,
+               const cudnn_frontend::ExecutionPlan **plan,
+               int64_t *workspace_size,
+               cudnnHandle_t handle) {
+    // Note(tizheng): CUDNNv8 execution plan is not thread-safe.
+    // A shared plan being executed by different threads is
+    // generally not safe (for now).
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    auto engine_config = map_[MakeKey(op_graph, use_addto)];
-    return engine_config;
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+
+    auto it = local_map.find(GetExtendedFeature(feature, handle));
+    if (it == local_map.end()) {
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "[cudnn_frontend] Cached Plan Not Found."));
+      return;
+    }
+    *plan = &(it->second);
+    *workspace_size = (*plan)->getWorkspaceSize();
+    VLOG(4) << "Cached execution plan found." << (*plan)->getTag()
+            << "; Require workspace: " << *workspace_size;
   }
 
-  void InsertPlan(const cudnn_frontend::OperationGraph& op_graph,
-                  const cudnn_frontend::ExecutionPlan& plan,
-                  bool use_addto = false) {
-    VLOG(4) << "[cudnn_frontend] cache: Insert graph tag: "
-            << op_graph.getTag();
+  void InsertPlan(const cudnn_frontend::feature_vector_t &feature,
+                  const cudnn_frontend::ExecutionPlan &plan,
+                  cudnnHandle_t handle) {
+    VLOG(4) << "[cudnn_frontend] cache: Insert plan: " << plan.getTag();
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    map_.insert(
-        std::make_pair(MakeKey(op_graph, use_addto), plan.GetEngineConfig()));
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+    local_map.insert(std::make_pair(GetExtendedFeature(feature, handle), plan));
   }
 
-  bool IsStable(const cudnn_frontend::OperationGraph& op_graph,
-                const std::string& tag,
-                bool use_addto = false) {
+  bool IsStable(const cudnn_frontend::feature_vector_t &feature,
+                const std::string &tag,
+                cudnnHandle_t handle) {
     if (saturation_count_ == 1) {
       return true;
    }
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    if (map_.count(MakeKey(op_graph, use_addto))) {
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+    auto &local_tracker = tracker_[hasher(std::this_thread::get_id())];
+    auto ext_feature = GetExtendedFeature(feature, handle);
+    if (local_map.count(ext_feature)) {
       return false;
     }
-    int cnt = tracker_[std::make_pair(MakeKey(op_graph, use_addto), tag)] += 1;
-    VLOG(4) << "[cudnn_frontend] SaturationTracker: " << op_graph.getTag()
-            << " " << tag << " " << cnt;
+    int cnt = local_tracker[std::make_pair(ext_feature, tag)] += 1;
+    VLOG(4) << "[cudnn_frontend] SaturationTracker: " << tag << " " << cnt;
     return cnt >= saturation_count_;
   }
 
+  bool FindPlan(const cudnn_frontend::OperationGraph &op_graph,
+                cudnnHandle_t handle) {
+    return FindPlan(op_graph.getFeatureVector(), handle);
+  }
+
+  void GetPlan(const cudnn_frontend::OperationGraph &op_graph,
+               const cudnn_frontend::ExecutionPlan **plan,
+               int64_t *workspace_size,
+               cudnnHandle_t handle) {
+    GetPlan(op_graph.getFeatureVector(), plan, workspace_size, handle);
+  }
+
+  void InsertPlan(const cudnn_frontend::OperationGraph &op_graph,
+                  const cudnn_frontend::ExecutionPlan &plan,
+                  cudnnHandle_t handle) {
+    InsertPlan(op_graph.getFeatureVector(), plan, handle);
+  }
+
+  bool IsStable(const cudnn_frontend::OperationGraph &op_graph,
+                const std::string &tag,
+                cudnnHandle_t handle) {
+    return IsStable(op_graph.getFeatureVector(), tag, handle);
+  }
+
  private:
-  static cudnn_frontend::feature_vector_t MakeKey(
-      const cudnn_frontend::OperationGraph& op_graph, bool use_addto) {
-    auto key = op_graph.getFeatureVector();
-    key.push_back(static_cast<uint64_t>(use_addto));
-    return key;
+  cudnn_frontend::feature_vector_t GetExtendedFeature(
+      cudnn_frontend::feature_vector_t feat, cudnnHandle_t handle) {
+    int64_t val = 0;
+    memcpy(&val, &handle, sizeof(int64_t));
+    feat.push_back(val);
+    return feat;
   }
 
+  using FeatureVectorToPlanMap =
+      std::map<cudnn_frontend::feature_vector_t, cudnn_frontend::ExecutionPlan>;
+  std::map<std::size_t, FeatureVectorToPlanMap> map_;
+  std::hash<std::thread::id> hasher;
 
-  std::map<cudnn_frontend::feature_vector_t, cudnn_frontend::ManagedOpaqueDescriptor>
-      map_;
   std::shared_ptr<std::mutex> cache_mutex_;
   int saturation_count_;
 
   using SaturationTracker =
       std::map<std::pair<cudnn_frontend::feature_vector_t, std::string>, int>;
-  SaturationTracker tracker_;
+  std::map<std::size_t, SaturationTracker> tracker_;
 
   int64_t cache_hits_{0};
   int64_t cache_misses_{0};
diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h
index e53d5783048ff..ef8e606e547ce 100644
--- a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h
+++ b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/autotune/cache.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
+#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
 
 namespace phi {
 
@@ -102,6 +103,33 @@ class CudnnFrontendConvHelper {
         .build();
   }
 
+  static inline cudnn_frontend::Tensor GetGeneralTensorDescriptor(
+      std::vector<int64_t> dims,
+      cudnnTensorFormat_t layout,
+      int64_t id,
+      int64_t alignment,
+      cudnnDataType_t dtype,
+      bool is_virtual = false,
+      int64_t group_count = 0) {
+    std::vector<int64_t> strides = GenerateStrides(dims, layout);
+    if (group_count > 0) {
+      int64_t c_per_group = dims[1];
+      int64_t c_stride = strides[1];
+      dims.insert(dims.begin() + 1, group_count);
+      strides.insert(strides.begin() + 1, c_stride * c_per_group);
+    }
+    cudnn_frontend::TensorBuilder builder;
+    builder.setDim(dims.size(), dims.data())
+        .setStride(strides.size(), strides.data())
+        .setId(id)
+        .setAlignment(alignment)
+        .setDataType(dtype);
+    if (is_virtual) {
+      builder.setVirtual();
+    }
+    return builder.build();
+  }
+
   static cudnn_frontend::ConvDesc_v8 GetConvDescriptor(
       cudnnDataType_t dataType,
       const std::vector<int64_t>& padding,
@@ -157,44 +185,26 @@ class CudnnFrontendConvHelper {
       cudnn_frontend::OperationGraph* op_graph_pointer,
       bool exhaustive_search,
       bool deterministic,
-      void* x_data,
-      void* y_data,
-      void* w_data,
+      std::vector<void*>* data_ptrs,
+      std::vector<int64_t>* uids,
       cudnnHandle_t handle,
       phi::DnnWorkspaceHandle* workspace_handle) {
     auto heurgen_method = [=](cudnn_frontend::OperationGraph& op_graph_)
         -> cudnn_frontend::EngineConfigList {
-      auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
-                            .setOperationGraph(op_graph_)
-                            .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
-                            .build();
-      VLOG(4) << "Heuristic has " << heuristics.getEngineConfigCount()
-              << " configurations ";
-
-      auto& engine_configs =
-          heuristics.getEngineConfig(heuristics.getEngineConfigCount());
-      cudnn_frontend::EngineConfigList filtered_configs;
-      cudnn_frontend::filter(engine_configs,
-                             filtered_configs,
-                             deterministic ? IsNonDeterministic : AllowAll);
-      return filtered_configs;
-    };
-
-    auto fallback_method = [=](cudnn_frontend::OperationGraph& op_graph_)
-        -> cudnn_frontend::EngineConfigList {
-      auto fallback = cudnn_frontend::EngineFallbackListBuilder()
-                          .setOperationGraph(op_graph_)
-                          .build();
-      auto& fallback_list = fallback.getFallbackList();
       cudnn_frontend::EngineConfigList filtered_configs;
-      cudnn_frontend::filter(fallback_list,
-                             filtered_configs,
-                             deterministic ? IsNonDeterministic : AllowAll);
+      auto statuses = cudnn_frontend::get_heuristics_list<2>(
+          {"heuristics_instant", "heuristics_fallback"},
+          op_graph_,
+          deterministic ? IsNonDeterministic : AllowAll,
+          filtered_configs,
+          true);
+      VLOG(6) << "Filter config list has " << filtered_configs.size()
+              << " configurations ";
       return filtered_configs;
     };
 
-    std::array<cudnn_frontend::GeneratorSource const, 2> sources = {
-        heurgen_method, fallback_method};
+    std::array<cudnn_frontend::GeneratorSource const, 1> sources = {
+        heurgen_method};
     cudnn_frontend::EngineConfigGenerator generator(sources.size(),
                                                     sources.data());
@@ -204,30 +214,19 @@ class CudnnFrontendConvHelper {
         [=](cudnn_frontend::ExecutionPlan const& plan) -> bool {
       return plan.getWorkspaceSize() > workspace_size_limit;
     };
-
-    auto plans =
-        generator.cudnnGetPlan(handle, *op_graph_pointer, predicate_function);
-
+    VLOG(6) << "[cudnn_frontend] Max workspace size: " << workspace_size_limit;
+    cudnn_frontend::executionPlans_t plans;
     bool use_autotune = phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
 
     if (!deterministic && (exhaustive_search || use_autotune)) {
-      size_t workspace_size_max = 0;
-      std::for_each(
-          plans.begin(), plans.end(), [&](cudnn_frontend::ExecutionPlan& opt) {
-            if (opt.getWorkspaceSize() > workspace_size_max) {
-              workspace_size_max = opt.getWorkspaceSize();
-            }
-          });
-      VLOG(6) << "[cudnn_frontend] Max workspace size: " << workspace_size_max;
       workspace_handle->RunFunc(
           [&](void* workspace_ptr) {
-            void* data_ptrs[] = {x_data, y_data, w_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
+            auto variant_pack =
+                cudnn_frontend::VariantPackBuilder()
+                    .setWorkspacePointer(workspace_ptr)
+                    .setDataPointers(data_ptrs->size(), data_ptrs->data())
+                    .setUids(uids->size(), uids->data())
+                    .build();
             plans = generator
                         .cudnnFindPlan<cudnn_frontend::CudnnFindSamplingTechnique::
                                            CUDNN_FIND_SAMPLE_ONCE>(
                             handle, *op_graph_pointer, variant_pack,
                             predicate_function);
           },
-          workspace_size_max);
+          workspace_size_limit);
+    } else {
+      plans = generator.cudnnGetPlan(
+          handle, *op_graph_pointer, predicate_function);
     }
     return plans;
   }
 
+  static cudnn_frontend::executionPlans_t FindExecutionPlans(
+      cudnn_frontend::OperationGraph* op_graph_pointer,
+      bool exhaustive_search,
+      bool deterministic,
+      void* x_data,
+      void* y_data,
+      void* w_data,
+      cudnnHandle_t handle,
+      phi::DnnWorkspaceHandle* workspace_handle) {
+    std::vector<void*> data_ptrs({x_data, y_data, w_data});
+    std::vector<int64_t> uids({'x', 'y', 'w'});
+    return FindExecutionPlans(op_graph_pointer,
+                              exhaustive_search,
+                              deterministic,
+                              &data_ptrs,
+                              &uids,
+                              handle,
+                              workspace_handle);
+  }
+
+  static void ExecutePlan(cudnnHandle_t handle_,
+                          phi::DnnWorkspaceHandle* workspace_handle,
+                          std::vector<void*>* data_ptrs,
+                          std::vector<int64_t>* uids,
+                          cudnnBackendDescriptor_t plan_desc,
+                          int64_t workspace_size) {
+    workspace_handle->RunFunc(
+        [&](void* workspace_ptr) {
+          auto variant_pack =
+              cudnn_frontend::VariantPackBuilder()
+                  .setWorkspacePointer(workspace_ptr)
+                  .setDataPointers(data_ptrs->size(), data_ptrs->data())
+                  .setUids(uids->size(), uids->data())
+                  .build();
+          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
+              handle_, plan_desc, variant_pack.get_raw_desc()));
+        },
+        workspace_size);
+  }
+
+  static void ExecutePlan(cudnnHandle_t handle_,
+                          phi::DnnWorkspaceHandle* workspace_handle,
+                          void* x_data,
+                          void* y_data,
+                          void* w_data,
+                          cudnnBackendDescriptor_t plan_desc,
+                          int64_t workspace_size) {
+    std::vector<void*> data_ptrs({x_data, y_data, w_data});
+    std::vector<int64_t> uids({'x', 'y', 'w'});
+    ExecutePlan(handle_,
+                workspace_handle,
+                &data_ptrs,
+                &uids,
+                plan_desc,
+                workspace_size);
+  }
+
+  static void ExecutePlansAndCache(
+      cudnnHandle_t handle_,
+      phi::DnnWorkspaceHandle* workspace_handle,
+      std::vector<void*>* data_ptrs,
+      std::vector<int64_t>* uids,
+      cudnn_frontend::executionPlans_t* plans,
+      bool exhaustive_search,
+      const cudnn_frontend::feature_vector_t& feature_vector,
+      phi::autotune::CudnnFrontendPlanCache* plan_cache) {
+    for (auto& plan : *plans) {
+      try {
+        ExecutePlan(handle_,
+                    workspace_handle,
+                    data_ptrs,
+                    uids,
+                    plan.get_raw_desc(),
+                    plan.getWorkspaceSize());
+        if (!exhaustive_search ||
+            plan_cache->IsStable(feature_vector, plan.getTag(), handle_)) {
+          plan_cache->InsertPlan(feature_vector, plan, handle_);
+        }
+        return;
+      } catch (cudnn_frontend::cudnnException& e) {
+        VLOG(4) << "Plan " << plan.describe()
+                << "failed to execute. Trying next plan.";
+      } catch (phi::enforce::EnforceNotMet& e) {
+        VLOG(4) << "Plan " << plan.describe()
+                << "failed to execute. Trying next plan.";
+      }
+    }
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "[CUDNN Frontend API] No valid plan could "
+        "be found to execute. Try setting FLAGS_conv_workspace_size_limit "
+        "higher."));
+  }
+
+  static void ExecutePlansAndCache(
+      cudnnHandle_t handle_,
+      phi::DnnWorkspaceHandle* workspace_handle,
+      void* x_data,
+      void* y_data,
+      void* w_data,
+      cudnn_frontend::executionPlans_t* plans,
+      bool exhaustive_search,
+      const cudnn_frontend::OperationGraph& op_graph,
+      phi::autotune::CudnnFrontendPlanCache* plan_cache) {
+    std::vector<void*> data_ptrs({x_data, y_data, w_data});
+    std::vector<int64_t> uids({'x', 'y', 'w'});
+    ExecutePlansAndCache(handle_,
+                         workspace_handle,
+                         &data_ptrs,
+                         &uids,
+                         plans,
+                         exhaustive_search,
+                         op_graph.getFeatureVector(),
+                         plan_cache);
+  }
+
+  static cudnn_frontend::Operation MakePointwiseOp(
+      cudnnPointwiseMode_t mode,
+      cudnnDataType_t dtype,
+      cudnn_frontend::Tensor const& x_desc,
+      cudnn_frontend::Tensor const& b_desc,
+      cudnn_frontend::Tensor const& y_desc,
+      float alpha1 = 1.0,
+      float alpha2 = 1.0) {
+    auto op_desc = cudnn_frontend::PointWiseDescBuilder()
+                       .setMode(mode)
+                       .setComputeType(dtype)
+                       .build();
+    auto op = cudnn_frontend::OperationBuilder(
+                  CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                  .setxDesc(x_desc)
+                  .setbDesc(b_desc)
+                  .setyDesc(y_desc)
+                  .setpwDesc(op_desc)
+                  .setAlpha(alpha1)
+                  .setAlpha2(alpha2)
+                  .build();
+    VLOG(6) << op.describe();
+    return op;
+  }
 };  // class CudnnFrontendConvHelper
 
 template <typename T>
@@ -290,29 +432,18 @@ void CudnnConvBwdDataV8(const DenseTensor* dy_tensor,
                                             alpha,
                                             beta);
 
-  if (plan_cache_bwd_data.FindPlan(op_graph, use_addto)) {
-    auto engine_config =
-        plan_cache_bwd_data.GetConfig(op_graph, handle, use_addto);
-    auto cached_plan = cudnn_frontend::ExecutionPlanBuilder()
-                           .setHandle(handle)
-                           .setEngineConfig(engine_config, op_graph.getTag())
-                           .build();
-    auto workspace_size = cached_plan.getWorkspaceSize();
-    VLOG(4) << "Cached execution plan found." << cached_plan.getTag()
-            << "; Require workspace: " << workspace_size;
-    workspace_handle->RunFunc(
-        [&](void* workspace_ptr) {
-          void* data_ptrs[] = {dx_tensor_data, dy_tensor_data, w_tensor_data};
-          int64_t uids[] = {'x', 'y', 'w'};
-          auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                  .setWorkspacePointer(workspace_ptr)
-                                  .setDataPointers(3, data_ptrs)
-                                  .setUids(3, uids)
-                                  .build();
-          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-              handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc()));
-        },
-        workspace_size);
+  if (plan_cache_bwd_data.FindPlan(op_graph, handle)) {
+    const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
+    int64_t workspace_size = 0;
+    plan_cache_bwd_data.GetPlan(
+        op_graph, &cached_plan, &workspace_size, handle);
+    helper::ExecutePlan(handle,
+                        workspace_handle,
+                        dx_tensor_data,
+                        dy_tensor_data,
+                        w_tensor_data,
+                        cached_plan->get_raw_desc(),
+                        workspace_size);
     return;
   }
 
@@ -325,34 +456,15 @@ void CudnnConvBwdDataV8(const DenseTensor* dy_tensor,
       handle,
       workspace_handle);
 
-  for (auto& plan : plans) {
-    try {
-      int64_t workspace_size = plan.getWorkspaceSize();
-      workspace_handle->RunFunc(
-          [&](void* workspace_ptr) {
-            void* data_ptrs[] = {dx_tensor_data, dy_tensor_data, w_tensor_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
-            PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-                handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
-          },
-          workspace_size);
-      if (!exhaustive_search ||
-          plan_cache_bwd_data.IsStable(op_graph, plan.getTag(), use_addto)) {
-        plan_cache_bwd_data.InsertPlan(op_graph, plan, use_addto);
-      }
-      return;
-    } catch (cudnn_frontend::cudnnException& e) {
-    } catch (phi::enforce::EnforceNotMet& e) {
-    }
-  }
-  PADDLE_THROW(
-      phi::errors::InvalidArgument("[CUDNN Frontend API] No valid plan could "
-                                   "be found to execute conv backward data."));
+  helper::ExecutePlansAndCache(handle,
+                               workspace_handle,
+                               dx_tensor_data,
+                               dy_tensor_data,
+                               w_tensor_data,
+                               &plans,
+                               exhaustive_search,
+                               op_graph,
+                               &plan_cache_bwd_data);
 }
 
 template <typename T>
@@ -394,28 +506,18 @@ void CudnnConvBwdFilterV8(const DenseTensor* x_tensor,
                                             alpha,
                                             beta);
 
-  if (plan_cache_bwd_filter.FindPlan(op_graph)) {
-    auto engine_config = plan_cache_bwd_filter.GetConfig(op_graph, handle);
-    auto cached_plan = cudnn_frontend::ExecutionPlanBuilder()
-                           .setHandle(handle)
-                           .setEngineConfig(engine_config, op_graph.getTag())
-                           .build();
-    auto workspace_size = cached_plan.getWorkspaceSize();
-    VLOG(4) << "Cached execution plan found." << cached_plan.getTag()
-            << "; Require workspace: " << workspace_size;
-    workspace_handle->RunFunc(
-        [&](void* workspace_ptr) {
-          void* data_ptrs[] = {x_tensor_data, dy_tensor_data, dw_tensor_data};
-          int64_t uids[] = {'x', 'y', 'w'};
-          auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                  .setWorkspacePointer(workspace_ptr)
-                                  .setDataPointers(3, data_ptrs)
-                                  .setUids(3, uids)
-                                  .build();
-          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-              handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc()));
-        },
-        workspace_size);
+  if (plan_cache_bwd_filter.FindPlan(op_graph, handle)) {
+    const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
+    int64_t workspace_size = 0;
+    plan_cache_bwd_filter.GetPlan(
+        op_graph, &cached_plan, &workspace_size, handle);
+    helper::ExecutePlan(handle,
+                        workspace_handle,
+                        x_tensor_data,
+                        dy_tensor_data,
+                        dw_tensor_data,
+                        cached_plan->get_raw_desc(),
+                        workspace_size);
     return;
   }
 
@@ -428,39 +530,15 @@
       handle,
       workspace_handle);
 
-  for (auto& plan : plans) {
-    try {
-      int64_t workspace_size = plan.getWorkspaceSize();
-      workspace_handle->RunFunc(
-          [&](void* workspace_ptr) {
-            void* data_ptrs[] = {x_tensor_data, dy_tensor_data, dw_tensor_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
-            PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-                handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
-          },
-          workspace_size);
-      if (!exhaustive_search ||
-          plan_cache_bwd_filter.IsStable(op_graph, plan.getTag())) {
-        plan_cache_bwd_filter.InsertPlan(op_graph, plan);
-      }
-      return;
-    } catch (cudnn_frontend::cudnnException& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    } catch (phi::enforce::EnforceNotMet& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    }
-  }
-
-  PADDLE_THROW(phi::errors::InvalidArgument(
-      "[CUDNN Frontend API] No valid plan could "
-      "be found to execute conv backward filter."));
+  helper::ExecutePlansAndCache(handle,
+                               workspace_handle,
+                               x_tensor_data,
+                               dy_tensor_data,
+                               dw_tensor_data,
+                               &plans,
+                               exhaustive_search,
+                               op_graph,
+                               &plan_cache_bwd_filter);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
index 15161dd61c697..6dc7fc9e6131d 100644
--- a/paddle/phi/kernels/gpudnn/conv_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -261,28 +261,17 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor,
                                             alpha,
                                             beta);
 
-  if (plan_cache.FindPlan(op_graph)) {
-    auto engine_config = plan_cache.GetConfig(op_graph, handle);
-    auto cached_plan = cudnn_frontend::ExecutionPlanBuilder()
-                           .setHandle(handle)
-                           .setEngineConfig(engine_config, op_graph.getTag())
-                           .build();
-    auto workspace_size = cached_plan.getWorkspaceSize();
-    VLOG(4) << "Cached execution plan found." << cached_plan.getTag()
-            << "; Require workspace: " << workspace_size;
-    workspace_handle.RunFunc(
-        [&](void* workspace_ptr) {
-          void* data_ptrs[] = {input_data, output_data, filter_data};
-          int64_t uids[] = {'x', 'y', 'w'};
-          auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                  .setWorkspacePointer(workspace_ptr)
-                                  .setDataPointers(3, data_ptrs)
-                                  .setUids(3, uids)
-                                  .build();
-          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-              handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc()));
-        },
-        workspace_size);
+  if (plan_cache.FindPlan(op_graph, handle)) {
+    const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
+    int64_t workspace_size = 0;
+    plan_cache.GetPlan(op_graph, &cached_plan, &workspace_size, handle);
+    helper::ExecutePlan(handle,
+                        &workspace_handle,
+                        input_data,
+                        output_data,
+                        filter_data,
+                        cached_plan->get_raw_desc(),
+                        workspace_size);
     return;
   }
 
@@ -295,37 +284,15 @@
       handle,
      &workspace_handle);
 
-  for (auto& plan : plans) {
-    try {
-      int64_t workspace_size = plan.getWorkspaceSize();
-      workspace_handle.RunFunc(
-          [&](void* workspace_ptr) {
-            void* data_ptrs[] = {input_data, output_data, filter_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
-            PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-                handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
-          },
-          workspace_size);
-      if (!exhaustive_search || plan_cache.IsStable(op_graph, plan.getTag())) {
-        plan_cache.InsertPlan(op_graph, plan);
-      }
-      return;
-    } catch (cudnn_frontend::cudnnException& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    } catch (phi::enforce::EnforceNotMet& e) {
-      VLOG(4) << "Plan " << plan.describe()
Trying next plan."; - } - } - PADDLE_THROW( - phi::errors::InvalidArgument("[CUDNN Frontend API] No valid plan could " - "be found to execute conv.")); + helper::ExecutePlansAndCache(handle, + &workspace_handle, + input_data, + output_data, + filter_data, + &plans, + exhaustive_search, + op_graph, + &plan_cache); } #endif diff --git a/patches/cudnn-frontend/0001-patch-for-paddle.patch b/patches/cudnn-frontend/0001-patch-for-paddle.patch deleted file mode 100644 index bf5288f06eea2..0000000000000 --- a/patches/cudnn-frontend/0001-patch-for-paddle.patch +++ /dev/null @@ -1,137 +0,0 @@ -From dce3465da518641ee177187fbc0c0d36faea28f2 Mon Sep 17 00:00:00 2001 -From: Tian Zheng -Date: Thu, 27 Oct 2022 20:33:16 -0700 -Subject: [PATCH] patch for paddle - ---- - include/cudnn_frontend_ExecutionPlan.h | 10 +++++++--- - include/cudnn_frontend_ExecutionPlanCache.h | 2 +- - include/cudnn_frontend_OperationGraph.h | 2 +- - include/cudnn_frontend_find_plan.h | 6 +++--- - include/cudnn_frontend_get_plan.h | 4 ++-- - 5 files changed, 14 insertions(+), 10 deletions(-) - -diff --git a/include/cudnn_frontend_ExecutionPlan.h b/include/cudnn_frontend_ExecutionPlan.h -index 7bed4b4..3314b5c 100644 ---- a/include/cudnn_frontend_ExecutionPlan.h -+++ b/include/cudnn_frontend_ExecutionPlan.h -@@ -167,6 +167,10 @@ class ExecutionPlan_v8 : public BackendDescriptor { - return json_string; - #endif - } -+ -+ ManagedOpaqueDescriptor GetEngineConfig() const { -+ return engine_config; -+ } - - ExecutionPlan_v8(ExecutionPlan_v8 const &) = default; - ExecutionPlan_v8 & -@@ -182,7 +186,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { - CUDNN_TYPE_NUMERICAL_NOTE, - CUDNN_NUMERICAL_NOTE_TYPE_COUNT, - &elem_count, -- NULL); -+ nullptr); - numeric_notes_vec.resize(elem_count); - status = cudnnBackendGetAttribute(extractedEngine_, - CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, -@@ -206,7 +210,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { - CUDNN_TYPE_BEHAVIOR_NOTE, - CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, - &elem_count, -- NULL); -+ nullptr); - behavior_notes_vec.resize(elem_count); - status = cudnnBackendGetAttribute(extractedEngine_, - CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, -@@ -310,7 +314,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { - CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE, - CUDNN_TYPE_INT64, - 1, -- NULL, -+ nullptr, - &workSpaceSize); - if (status != CUDNN_STATUS_SUCCESS) { - set_error_and_throw_exception(this, -diff --git a/include/cudnn_frontend_ExecutionPlanCache.h b/include/cudnn_frontend_ExecutionPlanCache.h -index 99a157c..741c490 100644 ---- a/include/cudnn_frontend_ExecutionPlanCache.h -+++ b/include/cudnn_frontend_ExecutionPlanCache.h -@@ -94,7 +94,7 @@ class ExecutionPlanCache_v1 { - - /// String to map of feature_vector to execution plan - /// For a given FeatureVector of type T according to the Operation Graph, we get the plan. 
--    using FeatureVectorToPlanMap = std::map;
-+    using FeatureVectorToPlanMap = std::map;
-     FeatureVectorToPlanMap cache;
-
-     mutable std::mutex cache_mutex;
-
-diff --git a/include/cudnn_frontend_OperationGraph.h b/include/cudnn_frontend_OperationGraph.h
-index 1478ce8..7894080 100644
---- a/include/cudnn_frontend_OperationGraph.h
-+++ b/include/cudnn_frontend_OperationGraph.h
-@@ -78,7 +78,7 @@ class OperationGraph_v8 : public BackendDescriptor {
-                                           CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT,
-                                           CUDNN_TYPE_INT64,
-                                           1,
--                                          NULL,
-+                                          nullptr,
-                                           &global_count);
-         if (status != CUDNN_STATUS_SUCCESS) {
-             set_error_and_throw_exception(this,
-diff --git a/include/cudnn_frontend_find_plan.h b/include/cudnn_frontend_find_plan.h
-index 02a08a1..5f94e45 100644
---- a/include/cudnn_frontend_find_plan.h
-+++ b/include/cudnn_frontend_find_plan.h
-@@ -53,7 +53,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
-     cudaDeviceSynchronize();
-
-     cudaStream_t stream = nullptr;
--    ::cudnnGetStream(handle, &stream);
-+    cudnnGetStream(handle, &stream);
-
-     for (auto &plan : plans) {
-         float time_ms = 0.0f;
-@@ -61,7 +61,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
-         float min_time_ms = std::numeric_limits<float>::max();
-
-         // Warm-up run
--        auto warmup_status = ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
-+        auto warmup_status = cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
-         if (warmup_status != CUDNN_STATUS_SUCCESS) {
-             getLogger() << "[cudnn_frontend] Plan " << plan.getTag() << " failed with " << to_string(warmup_status) << std::endl;
-             continue;
-         }
-
-         for (int i = 0; i < maxIterCount; i++) {
-             cudaEventRecord(start, stream);
-
--            ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
-+            cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
-
-             cudaEventRecord(stop, stream);
-             cudaEventSynchronize(stop);
-diff --git a/include/cudnn_frontend_get_plan.h b/include/cudnn_frontend_get_plan.h
-index 50535ab..c43eec9 100644
---- a/include/cudnn_frontend_get_plan.h
-+++ b/include/cudnn_frontend_get_plan.h
-@@ -26,7 +26,7 @@
-
- namespace cudnn_frontend {
-
---auto
--+inline auto
- EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph)
-     -> executionPlans_t {
-     // Creating a set of execution plans that are supported.
-@@ -47,7 +47,7 @@ EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGra
-     return plans;
- }
-
---auto
--+inline auto
- EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph, Predicate pred)
-     -> executionPlans_t {
-     // Creating a set of execution plans that are supported.
---
-2.25.1
-
diff --git a/test/legacy_test/test_switch_autotune.py b/test/legacy_test/test_switch_autotune.py
index 69adf7246e7a8..92ca789d8b074 100644
--- a/test/legacy_test/test_switch_autotune.py
+++ b/test/legacy_test/test_switch_autotune.py
@@ -141,6 +141,10 @@ def run_program(self, enable_autotune):
         exe.run(startup_program)
         x = np.random.random(size=data_shape).astype('float32')
 
+        # Note(tizheng): warmup run to make sure the following runs
+        # are in the same thread. Necessary for CUDNNv8 tests
+        exe.run(program=main_program, feed={'X': x}, fetch_list=[loss])
+
         self.set_flags(enable_autotune)
         if enable_autotune:
             config = {"kernel": {"enable": True, "tuning_range": [1, 2]}}
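
Reviewer note on the caching scheme in cache_cudnn_frontend.h above: the cache key is the op graph's feature vector extended with the cudnnHandle_t value, and each thread gets its own inner map, because a CUDNNv8 execution plan is not thread-safe and plans are tied to the handle they were finalized with. The following is a minimal standalone sketch of that design, not code from this PR; PlanCacheSketch, FeatureKey, Plan, and Handle are illustrative stand-ins for the Paddle and cudnn-frontend types.

// Standalone sketch of the per-thread, per-handle plan cache (assumed names).
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <map>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

using FeatureKey = std::vector<int64_t>;  // stands in for cudnn_frontend::feature_vector_t
struct Plan { std::string tag; };         // stands in for cudnn_frontend::ExecutionPlan
using Handle = void*;                     // stands in for cudnnHandle_t

class PlanCacheSketch {
 public:
  bool Find(const FeatureKey& feature, Handle handle) {
    std::lock_guard<std::mutex> lock(mutex_);
    // One inner map per thread id: cached plans are never shared across threads.
    auto& local_map = map_[hasher_(std::this_thread::get_id())];
    return local_map.count(ExtendedKey(feature, handle)) > 0;
  }

  void Insert(const FeatureKey& feature, Handle handle, Plan plan) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto& local_map = map_[hasher_(std::this_thread::get_id())];
    local_map.emplace(ExtendedKey(feature, handle), std::move(plan));
  }

 private:
  // Fold the handle's value into the key, so a plan built on one handle is
  // never returned for another (mirrors GetExtendedFeature above).
  static FeatureKey ExtendedKey(FeatureKey key, Handle handle) {
    int64_t val = 0;
    std::memcpy(&val, &handle, sizeof(val));
    key.push_back(val);
    return key;
  }

  std::map<std::size_t, std::map<FeatureKey, Plan>> map_;
  std::hash<std::thread::id> hasher_;
  std::mutex mutex_;
};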
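A second note, on the IsStable gate that the PR keeps (now tracked per thread): during exhaustive search, a plan is only inserted into the cache once the same (graph feature vector, plan tag) pair has been selected FLAGS_cudnn_cache_saturation_count times, which filters out one-off autotuning winners. A minimal sketch of just that logic, with a hypothetical free function and the same illustrative FeatureKey as above:

// Sketch of the saturation gate used before caching an autotuned plan.
#include <map>
#include <string>
#include <utility>
#include <vector>

using FeatureKey = std::vector<long long>;
using SaturationTracker = std::map<std::pair<FeatureKey, std::string>, int>;

// Returns true once the (graph, plan-tag) pair has won `saturation_count`
// times; only then is the plan considered stable enough to cache.
bool IsStableSketch(SaturationTracker& tracker,
                    const FeatureKey& key,
                    const std::string& tag,
                    int saturation_count) {
  if (saturation_count == 1) return true;  // cache on the first win
  int cnt = tracker[std::make_pair(key, tag)] += 1;
  return cnt >= saturation_count;
}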