Skip to content

Commit

Permalink
[PHI decoupling] move cuda_graph from fluid to phi (#48686)
Browse files Browse the repository at this point in the history
* move cuda_graph from fluid to phi

* move device_memory_aligment from fluid to phi

* Revert "move device_memory_aligment from fluid to phi"

This reverts commit b92fcd3.

* update xpu cmake
  • Loading branch information
huangjiyi authored Dec 8, 2022
1 parent 91ff207 commit a4d9851
Show file tree
Hide file tree
Showing 12 changed files with 291 additions and 261 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ if(WITH_GPU OR WITH_ROCM)
endif()

if(WITH_GPU)
list(APPEND ALLOCATOR_DEPS cuda_graph)
list(APPEND ALLOCATOR_DEPS phi_backends)
endif()

if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2)
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ if(WITH_GPU)
nv_library(
cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc
DEPS device_context allocator cuda_graph)
DEPS device_context allocator phi_backends)
else()
cc_library(
cuda_graph_with_memory_pool
Expand Down
4 changes: 0 additions & 4 deletions paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
nv_library(
cuda_graph
SRCS cuda_graph.cc
DEPS enforce)
nv_library(
cuda_profiler
SRCS cuda_profiler.cc
Expand Down
224 changes: 8 additions & 216 deletions paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,45 +14,23 @@

#pragma once

#include <atomic>
#include <functional>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

#include "cuda.h" // NOLINT
#include "cuda_runtime.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/utils/optional.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"

namespace paddle {
namespace platform {

using CUDAKernelParams = phi::backends::gpu::CUDAKernelParams;
#if CUDA_VERSION < 10010
using cudaStreamCaptureMode = phi::backends::gpu::cudaStreamCaptureMode;
#endif
using CUDAGraph = phi::backends::gpu::CUDAGraph;
using CUDAGraphCaptureModeGuard = phi::backends::gpu::CUDAGraphCaptureModeGuard;

// Returns true iff x and y have identical object representations,
// i.e. every byte (including any padding bytes) matches. Logically
// equal values with differing padding may therefore compare unequal.
template <typename T>
static bool IsBitwiseEqual(const T &x, const T &y) {
  const void *lhs = &x;
  const void *rhs = &y;
  return std::memcmp(lhs, rhs, sizeof(T)) == 0;
}

class CUDAKernelParams {
public:
explicit CUDAKernelParams(const cudaKernelNodeParams *params)
: params_(params) {}

const void *func() const { return params_->func; }

template <typename T>
T &As(size_t idx) const {
return *reinterpret_cast<T *>(params_->kernelParams[idx]);
}

private:
const cudaKernelNodeParams *params_;
};

template <typename F, F f>
struct IsSameKernelHelper;

Expand Down Expand Up @@ -96,191 +74,5 @@ struct IsSameKernelHelper<Return (*)(FuncArgs...), kernel_fn> {
}
};

#if CUDA_VERSION >= 10010
// CUDA Graph is available on this toolkit: nothing to check.
static void ThrowErrorIfNotSupportCUDAGraph() {}
#else
// Polyfill for the stream-capture-mode enum, which CUDA introduces in 10.1.
enum cudaStreamCaptureMode {
  cudaStreamCaptureModeGlobal = 0,
  cudaStreamCaptureModeThreadLocal = 1,
  cudaStreamCaptureModeRelaxed = 2
};
// CUDA Graph requires CUDA >= 10.1; fail loudly on older toolkits.
static void ThrowErrorIfNotSupportCUDAGraph() {
  PADDLE_THROW(platform::errors::Unimplemented(
      "CUDA Graph is only supported when CUDA version >= 10.1"));
}
#endif

// NOTE: Capturing CUDA graphs in parallel is currently not supported.
// NOTE: Do not use this class directly; it is designed to be used
// together with the memory pool (see cuda_graph_with_memory_pool).
//
// RAII owner of one captured CUDA graph, possibly recorded as several
// segments. Instances are created only through BeginCapture()/EndCapture();
// at most one graph is being captured at a time (capturing_graph_).
class CUDAGraph {
  DISABLE_COPY_AND_ASSIGN(CUDAGraph);

  // The constructor throws if CUDA_VERSION < 10010, so the non-static
  // methods of CUDAGraph need not check CUDA_VERSION again.
  CUDAGraph() {
    ThrowErrorIfNotSupportCUDAGraph();
    id_ = UniqueID();
  }

 public:
  static constexpr int64_t kDefaultPoolID = 0;
  static constexpr int64_t kInvalidPoolID = -1;

  ~CUDAGraph() { Reset(); }

  // Unique id of this graph instance.
  CUDAGraphID ID() const { return id_; }

  // Binds a memory-pool id to the graph currently being captured.
  // pool_id <= kInvalidPoolID requests a fresh unique pool id; setting
  // the pool id twice on the same capture is an error.
  static int64_t SetMemoryPoolID(int64_t pool_id) {
    auto &pool_id_ = capturing_graph_->pool_id_;
    PADDLE_ENFORCE_EQ(
        pool_id_,
        kInvalidPoolID,
        phi::errors::InvalidArgument("Cannot reset memory pool id twice, the "
                                     "former memory pool id is %d.",
                                     pool_id_));
    if (pool_id <= kInvalidPoolID) {
      pool_id_ = UniqueMemoryPoolID();
    } else {
      PADDLE_ENFORCE_GE(
          pool_id,
          kDefaultPoolID,
          phi::errors::InvalidArgument("Invalid memory pool id %d.", pool_id));
      pool_id_ = pool_id;
    }
    return pool_id_;
  }

  int64_t PoolID() const { return pool_id_; }

  // Pool id of the graph currently being captured.
  static int64_t CapturingPoolID() { return capturing_graph_->pool_id_; }

  // Launches the captured graph(s); defined in the .cc file.
  void Replay();

  // Destroys the captured graph(s) and runs the reset callbacks.
  void Reset();

  // Registers a callback to run when Reset() is called. Thread-safe.
  void AddResetCallback(std::function<void()> callback) {
    std::lock_guard<std::mutex> guard(mtx_);
    callbacks_.push_back(std::move(callback));
  }

  void PrintToDotFiles(const std::string &dirname, unsigned int flags);

  // Whole-graph capture lifecycle.
  static void BeginCapture(platform::CUDAPlace place,
                           cudaStream_t stream,
                           cudaStreamCaptureMode mode);
  static std::unique_ptr<CUDAGraph> EndCapture();

  // Segment-level capture lifecycle (a graph may hold many segments).
  static void BeginSegmentCapture();
  static void EndSegmentCapture();

  // Convenience: register a reset callback on the capturing graph.
  static void AddResetCallbackDuringCapturing(std::function<void()> callback) {
    capturing_graph_->AddResetCallback(std::move(callback));
  }

  // No need to add CUDA_VERSION macro because capturing_graph_ would
  // always be nullptr (constructor throws error)
  static bool IsCapturing() { return capturing_graph_ != nullptr; }

  static CUDAGraphID CapturingID() { return capturing_graph_->id_; }

  static platform::CUDAPlace CapturingPlace() {
    return capturing_graph_->place_;
  }

  // This API can be used to debug which GPU operation is not
  // supported during capturing CUDA Graph.
  static bool IsValidCapturing();

  // True iff a capture is active in thread-local mode.
  static bool IsThreadLocalCapturing() {
#if CUDA_VERSION >= 10010
    return IsCapturing() &&
           capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal;
#else
    return false;
#endif
  }

  // True iff the calling thread is the one performing the capture
  // (any thread counts for global/relaxed capture modes).
  static bool IsThisThreadCapturing() {
    if (UNLIKELY(IsCapturing())) {
      return IsThreadLocalCapturing()
                 ? capturing_thread_id_.get() == std::this_thread::get_id()
                 : true;
    } else {
      return false;
    }
  }

  // Hook invoked before replay to (re)seed captured random kernels.
  using SetSeedFunc = std::function<bool(CUDAKernelParams *, bool)>;
  static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) {
    std::lock_guard<std::mutex> guard(capturing_graph_->func_mtx_);
    capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func));
  }

  static int64_t UniqueMemoryPoolID();

 private:
  static CUDAGraphID UniqueID();

 private:
#if CUDA_VERSION >= 10010
  std::vector<cudaGraph_t> graphs_;           // one entry per captured segment
  std::vector<cudaGraphExec_t> exec_graphs_;  // instantiated executables
  cudaStreamCaptureMode capture_mode_;
#endif
  cudaStream_t stream_{nullptr};  // stream the capture was recorded on
  platform::CUDAPlace place_;
  CUDAGraphID id_;
  int64_t pool_id_{kInvalidPoolID};
  std::vector<std::function<void()>> callbacks_;  // run by Reset()
  bool is_reset_{false};
  std::mutex mtx_;  // guards callbacks_

  std::vector<SetSeedFunc> set_seed_funcs_;
  // Per-segment hooks applied to each exec graph before replay
  // (presumably populated during capture — confirm in the .cc file).
  std::vector<std::vector<std::function<void(cudaGraphExec_t)>>> pre_hooks_;
  std::mutex func_mtx_;  // guards set_seed_funcs_ / pre_hooks_

  bool is_first_run_{true};

  // Global capture state: the capturing thread (thread-local mode only)
  // and the graph currently being captured, if any.
  static paddle::optional<std::thread::id> capturing_thread_id_;
  static std::unique_ptr<CUDAGraph> capturing_graph_;
};

#if CUDA_VERSION >= 10010
// Scoped guard that, while a CUDA graph capture is active, switches the
// calling thread's stream-capture mode and restores the previous mode on
// destruction. Does nothing when no capture is in progress.
class CUDAGraphCaptureModeGuard {
  DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);

 public:
  explicit CUDAGraphCaptureModeGuard(
      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {
    if (UNLIKELY(CUDAGraph::IsCapturing())) {
      PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode));
      // After cudaThreadExchangeStreamCaptureMode is called,
      // the variable "mode" would be set to the old capturing mode.
      old_mode_ = mode;
    }
  }

  ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW {
    if (UNLIKELY(CUDAGraph::IsCapturing())) {
      PADDLE_ENFORCE_GPU_SUCCESS(
          cudaThreadExchangeStreamCaptureMode(&old_mode_));
    }
  }

 private:
  cudaStreamCaptureMode old_mode_;
};
#else
// No-op fallback for CUDA < 10.1, where graph capture does not exist.
class CUDAGraphCaptureModeGuard {
  DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);

 public:
  explicit CUDAGraphCaptureModeGuard(
      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {}
};
#endif

} // namespace platform
} // namespace paddle
2 changes: 1 addition & 1 deletion paddle/fluid/platform/device/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ cc_library(
xpulib
device_context
op_kernel_type
phi_xpu_op_list)
phi_backends)
cc_library(
xpu_resource_pool
SRCS xpu_resource_pool.cc
Expand Down
5 changes: 3 additions & 2 deletions paddle/phi/backends/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ if(WITH_GPU OR WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
gpu/gpu_resources.cc)
if(WITH_GPU)
list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc)
list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc)
endif()
if(WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc)
Expand All @@ -16,8 +16,9 @@ if(WITH_GPU OR WITH_ROCM)
endif()

if(WITH_XPU)
add_subdirectory(xpu)
list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc)
list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc
xpu/xpu2_op_list.cc)
endif()

if(WITH_MKLDNN)
Expand Down
Loading

0 comments on commit a4d9851

Please sign in to comment.