Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PHI decoupling] move cuda_graph from fluid to phi #48686

Merged
merged 5 commits into from
Dec 8, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ if(WITH_GPU OR WITH_ROCM)
endif()

if(WITH_GPU)
list(APPEND ALLOCATOR_DEPS cuda_graph)
list(APPEND ALLOCATOR_DEPS phi_backends)
endif()

if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2)
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ if(WITH_GPU)
nv_library(
cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc
DEPS device_context allocator cuda_graph)
DEPS device_context allocator phi_backends)
else()
cc_library(
cuda_graph_with_memory_pool
Expand Down
4 changes: 0 additions & 4 deletions paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
nv_library(
cuda_graph
SRCS cuda_graph.cc
DEPS enforce)
nv_library(
cuda_profiler
SRCS cuda_profiler.cc
Expand Down
224 changes: 8 additions & 216 deletions paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,45 +14,23 @@

#pragma once

#include <atomic>
#include <functional>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

#include "cuda.h" // NOLINT
#include "cuda_runtime.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/utils/optional.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"

namespace paddle {
namespace platform {

// Re-export the phi::backends::gpu CUDA graph types under paddle::platform so
// that code referring to platform::CUDAGraph (and friends) resolves to the
// phi definitions pulled in by the include above.
using CUDAKernelParams = phi::backends::gpu::CUDAKernelParams;
#if CUDA_VERSION < 10010
// Only aliased for CUDA < 10.1.
// NOTE(review): presumably newer toolkits declare cudaStreamCaptureMode
// themselves at global scope - confirm against the phi header.
using cudaStreamCaptureMode = phi::backends::gpu::cudaStreamCaptureMode;
#endif
using CUDAGraph = phi::backends::gpu::CUDAGraph;
using CUDAGraphCaptureModeGuard = phi::backends::gpu::CUDAGraphCaptureModeGuard;

// Compares the raw storage of two objects of the same type.
//
// Returns true when the sizeof(T) bytes starting at &x and &y are identical,
// false otherwise. No operator== of T is involved; only the bytes are compared.
template <typename T>
static bool IsBitwiseEqual(const T &x, const T &y) {
  const void *lhs_bytes = &x;
  const void *rhs_bytes = &y;
  const int byte_diff = std::memcmp(lhs_bytes, rhs_bytes, sizeof(T));
  return byte_diff == 0;
}

// Non-owning, read-only view over a cudaKernelNodeParams record.
// The wrapped pointer is stored as given; this class never frees it.
class CUDAKernelParams {
 public:
  explicit CUDAKernelParams(const cudaKernelNodeParams *params)
      : params_(params) {}

  // Returns the `func` field of the wrapped parameter record.
  const void *func() const {
    const void *kernel_fn = params_->func;
    return kernel_fn;
  }

  // Returns a reference to the idx-th entry of `kernelParams`, reinterpreted
  // as a T. No bounds or type checking is performed here.
  template <typename T>
  T &As(size_t idx) const {
    auto *raw_arg = params_->kernelParams[idx];
    T *typed_arg = reinterpret_cast<T *>(raw_arg);
    return *typed_arg;
  }

 private:
  const cudaKernelNodeParams *params_;
};

template <typename F, F f>
struct IsSameKernelHelper;

Expand Down Expand Up @@ -96,191 +74,5 @@ struct IsSameKernelHelper<Return (*)(FuncArgs...), kernel_fn> {
}
};

#if CUDA_VERSION < 10010
// Fallback declaration of the capture-mode enum for CUDA < 10.1.
// NOTE(review): presumably the toolkit headers do not provide it for these
// versions - confirm.
enum cudaStreamCaptureMode {
  cudaStreamCaptureModeGlobal = 0,
  cudaStreamCaptureModeThreadLocal = 1,
  cudaStreamCaptureModeRelaxed = 2
};

// CUDA < 10.1: always raises an Unimplemented error.
static void ThrowErrorIfNotSupportCUDAGraph() {
  PADDLE_THROW(platform::errors::Unimplemented(
      "CUDA Graph is only supported when CUDA version >= 10.1"));
}
#else
// CUDA >= 10.1: nothing to check, so this is a no-op.
static void ThrowErrorIfNotSupportCUDAGraph() {}
#endif

// NOTE: Capturing CUDA graphs in parallel is currently not supported.
// NOTE: Do not use this class directly because it should be used with
// the memory pool.
//
// The graph currently being captured is held in the static capturing_graph_
// member; most static methods below read or modify that object.
class CUDAGraph {
DISABLE_COPY_AND_ASSIGN(CUDAGraph);

// The constructor throws an error if CUDA_VERSION < 10010, so the
// non-static methods of CUDAGraph need not check CUDA_VERSION again.
// It is declared before `public:` and is therefore private.
CUDAGraph() {
ThrowErrorIfNotSupportCUDAGraph();
id_ = UniqueID();
}

public:
// Pool id constants: kInvalidPoolID is the initial value of pool_id_.
static constexpr int64_t kDefaultPoolID = 0;
static constexpr int64_t kInvalidPoolID = -1;

// Destruction calls Reset() (defined out of line).
~CUDAGraph() { Reset(); }

// Returns the id assigned in the constructor via UniqueID().
CUDAGraphID ID() const { return id_; }

// Sets the memory pool id of the graph being captured and returns the id
// that was stored.
//  - Enforces that the current pool id is still kInvalidPoolID, i.e. the id
//    can only be set once.
//  - pool_id <= kInvalidPoolID: a new id is taken from UniqueMemoryPoolID().
//  - otherwise: pool_id is enforced to be >= kDefaultPoolID and used as is.
// NOTE(review): capturing_graph_ is dereferenced without a null check here;
// presumably callers only invoke this while capturing - confirm.
static int64_t SetMemoryPoolID(int64_t pool_id) {
auto &pool_id_ = capturing_graph_->pool_id_;
PADDLE_ENFORCE_EQ(
pool_id_,
kInvalidPoolID,
phi::errors::InvalidArgument("Cannot reset memory pool id twice, the "
"former memory pool id is %d.",
pool_id_));
if (pool_id <= kInvalidPoolID) {
pool_id_ = UniqueMemoryPoolID();
} else {
PADDLE_ENFORCE_GE(
pool_id,
kDefaultPoolID,
phi::errors::InvalidArgument("Invalid memory pool id %d.", pool_id));
pool_id_ = pool_id;
}
return pool_id_;
}

// Returns this graph's memory pool id.
int64_t PoolID() const { return pool_id_; }

// Returns the memory pool id of the graph being captured
// (dereferences capturing_graph_ without a null check).
static int64_t CapturingPoolID() { return capturing_graph_->pool_id_; }

// Defined out of line.
void Replay();

// Defined out of line.
void Reset();

// Appends a callback to callbacks_ while holding mtx_.
// NOTE(review): presumably these callbacks are run by Reset() - confirm in
// the implementation file.
void AddResetCallback(std::function<void()> callback) {
std::lock_guard<std::mutex> guard(mtx_);
callbacks_.push_back(std::move(callback));
}

// Defined out of line.
void PrintToDotFiles(const std::string &dirname, unsigned int flags);

// Capture entry/exit points; defined out of line.
static void BeginCapture(platform::CUDAPlace place,
cudaStream_t stream,
cudaStreamCaptureMode mode);
static std::unique_ptr<CUDAGraph> EndCapture();

static void BeginSegmentCapture();
static void EndSegmentCapture();

// Forwards the callback to AddResetCallback() of the graph being captured
// (dereferences capturing_graph_ without a null check).
static void AddResetCallbackDuringCapturing(std::function<void()> callback) {
capturing_graph_->AddResetCallback(std::move(callback));
}

// No need to add CUDA_VERSION macro because capturing_graph_ would
// always be nullptr (constructor throws error)
static bool IsCapturing() { return capturing_graph_ != nullptr; }

// Returns the id of the graph being captured
// (dereferences capturing_graph_ without a null check).
static CUDAGraphID CapturingID() { return capturing_graph_->id_; }

// Returns the place of the graph being captured
// (dereferences capturing_graph_ without a null check).
static platform::CUDAPlace CapturingPlace() {
return capturing_graph_->place_;
}

// This API can be used to debug which GPU operation is not
// supported during capturing CUDA Graph.
static bool IsValidCapturing();

// True only when a capture is active and its mode is
// cudaStreamCaptureModeThreadLocal; always false for CUDA < 10.1.
static bool IsThreadLocalCapturing() {
#if CUDA_VERSION >= 10010
return IsCapturing() &&
capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal;
#else
return false;
#endif
}

// False when nothing is being captured. While capturing: in thread-local
// mode, true only if the recorded capturing thread id equals the calling
// thread's id; in any other mode, always true.
static bool IsThisThreadCapturing() {
if (UNLIKELY(IsCapturing())) {
return IsThreadLocalCapturing()
? capturing_thread_id_.get() == std::this_thread::get_id()
: true;
} else {
return false;
}
}

using SetSeedFunc = std::function<bool(CUDAKernelParams *, bool)>;
// Appends set_seed_func to the capturing graph's set_seed_funcs_ while
// holding its func_mtx_ (dereferences capturing_graph_ without a null check).
static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) {
std::lock_guard<std::mutex> guard(capturing_graph_->func_mtx_);
capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func));
}

// Defined out of line.
static int64_t UniqueMemoryPoolID();

private:
// Defined out of line; used by the constructor to initialize id_.
static CUDAGraphID UniqueID();

private:
#if CUDA_VERSION >= 10010
// These members only exist when building against CUDA >= 10.1.
std::vector<cudaGraph_t> graphs_;
std::vector<cudaGraphExec_t> exec_graphs_;
cudaStreamCaptureMode capture_mode_;
#endif
cudaStream_t stream_{nullptr};
platform::CUDAPlace place_;
CUDAGraphID id_;
// Starts as kInvalidPoolID; assigned at most once by SetMemoryPoolID().
int64_t pool_id_{kInvalidPoolID};
// Callbacks registered through AddResetCallback(); guarded by mtx_ there.
std::vector<std::function<void()>> callbacks_;
bool is_reset_{false};
std::mutex mtx_;

// Entries added by RecordRandomKernelInfo(); guarded by func_mtx_ there.
std::vector<SetSeedFunc> set_seed_funcs_;
std::vector<std::vector<std::function<void(cudaGraphExec_t)>>> pre_hooks_;
std::mutex func_mtx_;

bool is_first_run_{true};

// Shared capture state: the thread id compared in IsThisThreadCapturing()
// and the graph being captured (nullptr when not capturing).
static paddle::optional<std::thread::id> capturing_thread_id_;
static std::unique_ptr<CUDAGraph> capturing_graph_;
};

#if CUDA_VERSION >= 10010
// Scoped guard around cudaThreadExchangeStreamCaptureMode.
//
// If a CUDA graph is being captured when the guard is constructed, the
// calling thread's capture mode is exchanged for `mode` (default:
// cudaStreamCaptureModeRelaxed) and the previous mode is remembered. On
// destruction the previous mode is exchanged back, provided an exchange was
// actually made by the constructor and a capture is still active.
//
// The destructor is marked PADDLE_MAY_THROW because restoring the mode goes
// through PADDLE_ENFORCE_GPU_SUCCESS.
class CUDAGraphCaptureModeGuard {
  DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);

 public:
  explicit CUDAGraphCaptureModeGuard(
      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {
    if (UNLIKELY(CUDAGraph::IsCapturing())) {
      PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode));
      // After cudaThreadExchangeStreamCaptureMode is called,
      // the variable "mode" would be set to the old capturing mode.
      old_mode_ = mode;
      exchanged_ = true;
    }
  }

  ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW {
    // Only restore when the constructor really swapped the mode. Without the
    // exchanged_ check, a capture that started after construction would make
    // this destructor pass a never-assigned old_mode_ to the CUDA runtime.
    if (exchanged_ && UNLIKELY(CUDAGraph::IsCapturing())) {
      PADDLE_ENFORCE_GPU_SUCCESS(
          cudaThreadExchangeStreamCaptureMode(&old_mode_));
    }
  }

 private:
  // Mode that was active before the constructor's exchange; only meaningful
  // when exchanged_ is true. Initialized so it is never indeterminate.
  cudaStreamCaptureMode old_mode_{cudaStreamCaptureModeRelaxed};
  // True once the constructor has exchanged the capture mode.
  bool exchanged_{false};
};
#else
// CUDA < 10.1: the guard is a no-op; `mode` is accepted only so call sites
// compile unchanged.
class CUDAGraphCaptureModeGuard {
  DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);

 public:
  explicit CUDAGraphCaptureModeGuard(
      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {}
};
#endif

} // namespace platform
} // namespace paddle
2 changes: 1 addition & 1 deletion paddle/fluid/platform/device/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ cc_library(
xpulib
device_context
op_kernel_type
phi_xpu_op_list)
phi_backends)
cc_library(
xpu_resource_pool
SRCS xpu_resource_pool.cc
Expand Down
6 changes: 3 additions & 3 deletions paddle/phi/backends/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ if(WITH_GPU OR WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
gpu/gpu_resources.cc)
if(WITH_GPU)
list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc)
list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc)
endif()
if(WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc)
Expand All @@ -16,8 +16,8 @@ if(WITH_GPU OR WITH_ROCM)
endif()

if(WITH_XPU)
add_subdirectory(xpu)
list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc)
list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc
xpu/xpu_op_list.cc)
endif()

if(WITH_MKLDNN)
Expand Down
Loading