record memory and op supplement info #43550

Merged
Changes from 3 commits
5 changes: 5 additions & 0 deletions paddle/fluid/framework/new_executor/interpretercore.cc
@@ -24,6 +24,7 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/core/kernel_context.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
@@ -485,6 +486,10 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
op_with_kernel->Info().infer_shape_(
instr_node.InnerInferShapeContext().get());
}
infershape_event.End();
platform::RecordOpInfoSupplement(op->Type(), op->Attrs(),
*(instr_node.InnerInferShapeContext()),
*(instr_node.InnerRuntimeContext()));
}
}

4 changes: 4 additions & 0 deletions paddle/fluid/framework/operator.cc
@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/kernel_context.h"
@@ -1512,6 +1513,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
1, platform::EventRole::kInnerOp);
RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
this->Info().infer_shape_(&infer_shape_ctx);
record_event.End();
platform::RecordOpInfoSupplement(Type(), Attrs(), infer_shape_ctx,
*runtime_ctx);
}

if (FLAGS_enable_unused_var_check) {
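Both hunks above follow the same two-step pattern: the infer-shape RecordEvent is ended explicitly, and only then is RecordOpInfoSupplement emitted, so the cost of collecting input shapes and dtypes is not billed to infer-shape time. A condensed sketch of that sequence as it appears in OperatorWithKernel::RunImpl (the RecordEvent name and event type are abbreviated here; they are not visible in this hunk):

platform::RecordEvent record_event("infer_shape",  // name/type assumed
                                   platform::TracerEventType::OperatorInner,
                                   1, platform::EventRole::kInnerOp);
RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
this->Info().infer_shape_(&infer_shape_ctx);
record_event.End();  // close the timing span first,
platform::RecordOpInfoSupplement(Type(), Attrs(), infer_shape_ctx,
                                 *runtime_ctx);  // then record shapes/dtypes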
30 changes: 29 additions & 1 deletion paddle/fluid/memory/allocation/stat_allocator.h
@@ -16,6 +16,7 @@

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"

namespace paddle {
namespace memory {
@@ -33,11 +34,24 @@ class StatAllocator : public Allocator {
if (platform::is_cpu_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
platform::RecordMemEvent(
allocation->ptr(), allocation->place(), allocation->size(),
HOST_MEMORY_STAT_CURRENT_VALUE(Allocated,
allocation->place().GetDeviceId()),
HOST_MEMORY_STAT_PEAK_VALUE(Allocated,
allocation->place().GetDeviceId()),
platform::TracerMemEventType::Free);
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
platform::RecordMemEvent(
allocation->ptr(), allocation->place(), allocation->size(),
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated,
allocation->place().GetDeviceId()),
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated,
allocation->place().GetDeviceId()),
platform::TracerMemEventType::Free);
}

underlying_allocator_->Free(allocation);
}

@@ -50,9 +64,23 @@
platform::is_cuda_pinned_place(place)) {
HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
allocation->size());
platform::RecordMemEvent(
allocation->ptr(), allocation->place(), allocation->size(),
HOST_MEMORY_STAT_CURRENT_VALUE(Allocated,
allocation->place().GetDeviceId()),
HOST_MEMORY_STAT_PEAK_VALUE(Allocated,
allocation->place().GetDeviceId()),
platform::TracerMemEventType::Allocate);
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
allocation->size());
platform::RecordMemEvent(
allocation->ptr(), allocation->place(), allocation->size(),
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated,
allocation->place().GetDeviceId()),
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated,
allocation->place().GetDeviceId()),
platform::TracerMemEventType::Allocate);
}
return allocation.release();
}
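The four call sites added above are deliberately symmetric: each stat update is immediately followed by a RecordMemEvent carrying the post-update current and peak values for the same device. A hypothetical helper condensing the shared shape (this PR inlines each branch instead; the allocate path additionally routes CUDA-pinned memory through the HOST_ branch, which is glossed over here):

// delta is +size on allocate and -size on free; kind selects the event type.
void UpdateStatAndRecord(void* ptr, const platform::Place& place, size_t size,
                         int64_t delta, platform::TracerMemEventType kind) {
  int dev_id = place.GetDeviceId();
  if (platform::is_cpu_place(place)) {
    HOST_MEMORY_STAT_UPDATE(Allocated, dev_id, delta);
    platform::RecordMemEvent(
        ptr, place, size,
        HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, dev_id),
        HOST_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id), kind);
  } else {
    DEVICE_MEMORY_STAT_UPDATE(Allocated, dev_id, delta);
    platform::RecordMemEvent(
        ptr, place, size,
        DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, dev_id),
        DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id), kind);
  }
}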
2 changes: 1 addition & 1 deletion paddle/fluid/memory/memcpy.cc
@@ -16,7 +16,7 @@ limitations under the License. */

#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/common/place.h"

#ifdef PADDLE_WITH_XPU
21 changes: 18 additions & 3 deletions paddle/fluid/platform/CMakeLists.txt
@@ -354,7 +354,9 @@ if(WITH_GPU)
enforce
dynload_cuda
new_profiler
stats)
stats
op_proto_maker
shape_inference)
nv_library(
device_memory_aligment
SRCS device_memory_aligment.cc
@@ -363,7 +365,14 @@ elseif(WITH_ROCM)
hip_library(
profiler
SRCS profiler.cc profiler.cu
DEPS os_info device_tracer gpu_info enforce new_profiler stats)
DEPS os_info
device_tracer
gpu_info
enforce
new_profiler
stats
op_proto_maker
shape_inference)
hip_library(
device_memory_aligment
SRCS device_memory_aligment.cc
@@ -372,7 +381,13 @@ else()
cc_library(
profiler
SRCS profiler.cc
DEPS os_info device_tracer enforce new_profiler stats)
DEPS os_info
device_tracer
enforce
new_profiler
stats
op_proto_maker
shape_inference)
cc_library(
device_memory_aligment
SRCS device_memory_aligment.cc
84 changes: 84 additions & 0 deletions paddle/fluid/platform/profiler.cc
@@ -30,6 +30,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nvtx.h"
#endif
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/platform/os_info.h"

PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
@@ -236,6 +237,45 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
name, start_end_ns, start_end_ns, EventRole::kOrdinary, type);
}

RecordOpInfoSupplement::RecordOpInfoSupplement(
const std::string &type, const framework::AttributeMap &attrs,
const framework::InferShapeContext &shape_ctx,
const framework::RuntimeContext &ctx) {
if (FLAGS_enable_host_event_recorder_hook == false) {
return;
}
std::map<std::string, std::vector<framework::DDim>> input_shapes;
std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) {
input_shapes[it->first] = shape_ctx.GetInputsDim(it->first);
dtypes[it->first] = shape_ctx.GetInputsVarType(it->first);
}

const std::vector<std::string> *callstack_ptr = nullptr;
std::vector<std::string> callstack;
auto iter = attrs.find(
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (iter != attrs.end()) {
callstack_ptr = &BOOST_GET_CONST(std::vector<std::string>, iter->second);
callstack = *callstack_ptr;
}
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance().RecordEvent(
PosixInNsec(), type, input_shapes, dtypes, callstack);
}

RecordMemEvent::RecordMemEvent(const void *ptr, const phi::Place &place,
size_t size, uint64_t current_allocated,
uint64_t current_reserved,
const TracerMemEventType type) {
if (type == TracerMemEventType::Allocate) {
platform::MemEvenRecorder::Instance().PushMemRecord(
ptr, place, size, current_allocated, current_reserved);
} else if (type == TracerMemEventType::Free) {
platform::MemEvenRecorder::Instance().PopMemRecord(
ptr, place, size, current_allocated, current_reserved);
}
}

void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
size_t size) {
if (g_state == ProfilerState::kDisabled) return;
@@ -248,6 +288,28 @@ void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
new MemEvenRecorder::RecordMemEvent(place, size)));
}

void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
size_t size, uint64_t current_allocated,
uint64_t current_reserved) {
if (g_state == ProfilerState::kDisabled &&
FLAGS_enable_host_event_recorder_hook == false)
return;
std::lock_guard<std::mutex> guard(mtx_);
if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord
HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
PosixInNsec(), reinterpret_cast<uint64_t>(ptr),
TracerMemEventType::Allocate, size, place, current_allocated,
current_reserved);
return;
}
auto &events = address_memevent_[place];
PADDLE_ENFORCE_EQ(events.count(ptr), 0,
platform::errors::InvalidArgument(
"The Place can't exist in the stage of PushMemRecord"));
events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size)));
}

void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> guard(mtx_);
@@ -259,6 +321,28 @@ void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
}
}

void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place,
size_t size, uint64_t current_allocated,
uint64_t current_reserved) {
if (g_state == ProfilerState::kDisabled &&
FLAGS_enable_host_event_recorder_hook == false)
return;
std::lock_guard<std::mutex> guard(mtx_);
if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord
HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
PosixInNsec(), reinterpret_cast<uint64_t>(ptr),
TracerMemEventType::Free, -size, place, current_allocated,
current_reserved);
return;
}
auto &events = address_memevent_[place];
auto iter = events.find(ptr);
// The ptr may not be in address_memevent
if (iter != events.end()) {
events.erase(iter);
}
}

void MemEvenRecorder::Flush() {
std::lock_guard<std::mutex> guard(mtx_);
address_memevent_.clear();
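Both new overloads share one dispatch rule: when FLAGS_enable_host_event_recorder_hook is set, the event goes straight to HostEventRecorder<CommonMemEvent> and the legacy per-place address map is bypassed; otherwise the original ProfilerState-gated behavior is preserved. A hypothetical call site exercising both event types (in this PR the real producer is StatAllocator above):

void DemoTrackAllocation(const void* ptr, const phi::Place& place, size_t size,
                         uint64_t current_allocated,
                         uint64_t current_reserved) {
  // Constructing the temporary runs MemEvenRecorder::PushMemRecord, which
  // forwards to HostEventRecorder when the hook flag is enabled.
  platform::RecordMemEvent(ptr, place, size, current_allocated,
                           current_reserved,
                           platform::TracerMemEventType::Allocate);
  // The matching free; PopMemRecord records -size as the increase in bytes.
  platform::RecordMemEvent(ptr, place, size, current_allocated,
                           current_reserved,
                           platform::TracerMemEventType::Free);
}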
5 changes: 5 additions & 0 deletions paddle/fluid/platform/profiler.h
@@ -30,6 +30,7 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.pb.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
@@ -102,6 +103,10 @@ struct MemEvenRecorder {
public:
void PushMemRecord(const void* ptr, const Place& place, size_t size);
void PopMemRecord(const void* ptr, const Place& place);
void PushMemRecord(const void* ptr, const Place& place, size_t size,
uint64_t current_allocated, uint64_t current_reserved);
void PopMemRecord(const void* ptr, const Place& place, size_t size,
uint64_t current_allocated, uint64_t current_reserved);
void Flush();
static MemEvenRecorder& Instance() { return recorder; }

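The original two-argument forms are kept for the legacy profiler, while the new five-argument overloads carry the stat snapshots recorded by RecordMemEvent. A hypothetical direct use of the recorder (production code reaches it through platform::RecordMemEvent instead; MemEvenRecorder, missing the "t" of "Event", is the pre-existing spelling of the class name):

void TrackBlock(const void* ptr, const platform::Place& place, size_t size,
                uint64_t current_allocated, uint64_t current_reserved) {
  auto& recorder = platform::MemEvenRecorder::Instance();
  recorder.PushMemRecord(ptr, place, size, current_allocated,
                         current_reserved);
  // ... later, when the same block is freed:
  recorder.PopMemRecord(ptr, place, size, current_allocated,
                        current_reserved);
}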
4 changes: 2 additions & 2 deletions paddle/fluid/platform/profiler/CMakeLists.txt
@@ -1,7 +1,7 @@
cc_library(
host_tracer
SRCS host_tracer.cc
DEPS enforce)
DEPS enforce ddim var_type_traits)
cc_library(
cuda_tracer
SRCS cuda_tracer.cc cupti_data_process.cc
@@ -10,7 +10,7 @@ add_subdirectory(mlu)
cc_library(
event_node
SRCS event_node.cc
DEPS enforce)
DEPS enforce place)
cc_library(
profiler_utils
SRCS utils.cc
8 changes: 6 additions & 2 deletions paddle/fluid/platform/profiler/chrometracing_logger.cc
@@ -17,6 +17,7 @@ limitations under the License. */
#include <cstdio>
#include <ctime>
#include <limits>
#include <regex>

#include "glog/logging.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
@@ -125,7 +126,7 @@ void ChromeTracingLogger::LogMemTraceEventNode(
std::string(
R"JSON(
{
"name": "[memory]", "pid": %lld, "tid": "%lld",
"name": "[memory]", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld,
"ph": "i", "cat": "%s",
"args": {
Expand All @@ -137,10 +138,11 @@ void ChromeTracingLogger::LogMemTraceEventNode(
}
},
)JSON"),
mem_node.ProcessId(), mem_node.ThreadId(), mem_node.TimeStampNs(),
mem_node.ProcessId(), mem_node.ThreadId(), nsToUs(mem_node.TimeStampNs()),
StringTracerMemEventType(mem_node.Type()), mem_node.Place().c_str(),
mem_node.Addr(), mem_node.CurrentAllocated(), mem_node.CurrentReserved(),
mem_node.IncreaseBytes());
pid_tid_set_.insert({mem_node.ProcessId(), mem_node.ThreadId()});
}

void ChromeTracingLogger::LogHostTraceEventNode(
@@ -164,6 +166,8 @@ void ChromeTracingLogger::LogHostTraceEventNode(
input_shapes = op_supplement_node->InputShapes();
input_dtypes = op_supplement_node->Dtypes();
callstack = op_supplement_node->CallStack();
callstack = std::regex_replace(callstack, std::regex("\""), "\'");
callstack = std::regex_replace(callstack, std::regex("\n"), "\\n");
}
switch (host_node.Type()) {
case TracerEventType::ProfileStep:
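The two regex_replace calls exist because the callstack is spliced into a JSON string literal: an unescaped double quote or a raw newline would corrupt the trace file. A self-contained sketch of the same sanitization (mirroring the replacements above; a hand-rolled loop would work equally well and avoid constructing a regex per call):

#include <regex>
#include <string>

// Make a multi-line callstack safe to embed in a JSON string value.
std::string SanitizeForJson(std::string callstack) {
  callstack = std::regex_replace(callstack, std::regex("\""), "'");
  callstack = std::regex_replace(callstack, std::regex("\n"), "\\n");
  return callstack;
}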
49 changes: 49 additions & 0 deletions paddle/fluid/platform/profiler/common_event.h
@@ -18,8 +18,10 @@
#include <functional>
#include <string>

#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later
#include "paddle/fluid/platform/profiler/trace_event.h"
#include "paddle/phi/core/ddim.h"

namespace paddle {
namespace platform {
@@ -63,5 +65,52 @@ struct CommonEvent {
const char *attr = nullptr; // not owned, designed for performance
};

struct CommonMemEvent {
public:
CommonMemEvent(uint64_t timestamp_ns, uint64_t addr, TracerMemEventType type,
int64_t increase_bytes, const Place &place,
uint64_t current_allocated, uint64_t current_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved) {}
uint64_t timestamp_ns;
uint64_t addr;
TracerMemEventType type;
int64_t increase_bytes;
Place place;
uint64_t current_allocated;
uint64_t current_reserved;
};

struct OperatorSupplementOriginEvent {
public:
OperatorSupplementOriginEvent(
std::function<void *(size_t)> arena_allocator, uint64_t timestamp_ns,
const std::string &type_name,
const std::map<std::string, std::vector<framework::DDim>> &input_shapes,
const std::map<std::string, std::vector<framework::proto::VarType::Type>>
&dtypes,
const std::vector<std::string> callstack)
: timestamp_ns(timestamp_ns),
input_shapes(input_shapes),
dtypes(dtypes),
callstack(callstack) {
auto buf = static_cast<char *>(arena_allocator(type_name.length() + 1));
strncpy(buf, type_name.c_str(), type_name.length() + 1);
op_type = buf;
}
uint64_t timestamp_ns;
const char *op_type = nullptr; // not owned, designed for performance
// input shapes
std::map<std::string, std::vector<framework::DDim>> input_shapes;
std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
// call stack
const std::vector<std::string> callstack;
};

} // namespace platform
} // namespace paddle
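OperatorSupplementOriginEvent copies shapes, dtypes, and the callstack by value, but stores op_type as a raw char* in caller-provided arena memory, so the hot recording path avoids one std::string allocation per event. An illustrative construction with a plain-malloc stand-in for the arena (HostEventRecorder supplies a real arena allocator in practice, which is what keeps op_type alive):

#include <cstdlib>

void DemoSupplementEvent() {
  std::map<std::string, std::vector<paddle::framework::DDim>> shapes;
  std::map<std::string,
           std::vector<paddle::framework::proto::VarType::Type>> dtypes;
  std::vector<std::string> callstack;
  paddle::platform::OperatorSupplementOriginEvent evt(
      [](size_t n) { return std::malloc(n); },  // demo arena: never freed
      /*timestamp_ns=*/0, /*type_name=*/"matmul_v2", shapes, dtypes,
      callstack);
}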