Skip to content

Commit db97837

Browse files
authored
vulkan: perf_logger improvements (#17672)
* vulkan: perf_logger improvements - Move perf_logger from device to ctx. - Add an env var (GGML_VK_PERF_LOGGER_FREQUENCY) to control how often the stats are dumped. If you set a very large value, the stats are only dumped when the ctx is destroyed. - Add a fusion info string to the tracking, and log only one item per fused op. - Fix MUL_MAT_ID flops calculation. * fix vector sizes
1 parent 017761d commit db97837

File tree

1 file changed

+85
-35
lines changed

1 file changed

+85
-35
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 85 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -777,11 +777,6 @@ struct vk_device_struct {
777777
std::unique_ptr<vk_memory_logger> memory_logger;
778778
#endif
779779

780-
// for GGML_VK_PERF_LOGGER
781-
std::unique_ptr<vk_perf_logger> perf_logger;
782-
vk::QueryPool query_pool;
783-
int32_t num_queries;
784-
785780
~vk_device_struct() {
786781
VK_LOG_DEBUG("destroy device " << name);
787782

@@ -1523,12 +1518,21 @@ class vk_memory_logger {
15231518
#define VK_LOG_MEMORY(msg) ((void) 0)
15241519
#endif // GGML_VULKAN_MEMORY_DEBUG
15251520

1521+
static bool vk_perf_logger_enabled = false;
1522+
// number of print_timings() calls between perf logger prints; must be nonzero because it is used as a modulus
1523+
static uint32_t vk_perf_logger_frequency = 1;
1524+
15261525
class vk_perf_logger {
15271526
public:
1528-
void print_timings() {
1527+
void print_timings(bool force = false) {
15291528
if (timings.empty()) {
15301529
return;
15311530
}
1531+
print_count++;
1532+
if ((print_count % vk_perf_logger_frequency) != 0 && !force) {
1533+
return;
1534+
}
1535+
print_count = 0;
15321536
uint64_t total_all_op_times = 0;
15331537
std::cerr << "----------------\nVulkan Timings:" << std::endl;
15341538
for (const auto & t : timings) {
@@ -1565,16 +1569,20 @@ class vk_perf_logger {
15651569
flops.clear();
15661570
}
15671571

1568-
void log_timing(const ggml_tensor * node, uint64_t time) {
1572+
void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
1573+
std::string fusion_str;
1574+
if (fusion_name) {
1575+
fusion_str = fusion_name + std::string(" ");
1576+
}
15691577
if (node->op == GGML_OP_UNARY) {
1570-
timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
1578+
timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
15711579
return;
15721580
}
15731581
if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
1574-
const uint64_t m = node->src[0]->ne[1];
1575-
const uint64_t n = (node->op == GGML_OP_MUL_MAT) ? node->ne[1] : node->ne[2];
1582+
const uint64_t m = node->ne[0];
1583+
const uint64_t n = node->ne[1];
15761584
const uint64_t k = node->src[1]->ne[0];
1577-
const uint64_t batch = node->src[1]->ne[2] * node->src[1]->ne[3];
1585+
const uint64_t batch = node->ne[2] * node->ne[3];
15781586
std::string name = ggml_op_name(node->op);
15791587
if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) ||
15801588
(node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) {
@@ -1583,9 +1591,13 @@ class vk_perf_logger {
15831591
name += " ";
15841592
name += ggml_type_name(node->src[0]->type);
15851593
name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
1594+
if (node->op == GGML_OP_MUL_MAT_ID) {
1595+
name += " n_expert=" + std::to_string(node->src[0]->ne[2]);
1596+
}
15861597
if (batch > 1) {
15871598
name += " batch=" + std::to_string(batch);
15881599
}
1600+
name = fusion_str + name;
15891601
timings[name].push_back(time);
15901602
flops[name].push_back(m * n * (k + (k - 1)) * batch);
15911603
return;
@@ -1607,13 +1619,15 @@ class vk_perf_logger {
16071619
uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
16081620
name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
16091621
", N=N*OW*OH=" + std::to_string(size_N);
1622+
name = fusion_str + name;
16101623
flops[name].push_back(n_flops);
16111624
timings[name].push_back(time);
16121625
return;
16131626
}
16141627
if (node->op == GGML_OP_RMS_NORM) {
16151628
std::string name = ggml_op_name(node->op);
16161629
name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
1630+
name = fusion_str + name;
16171631
timings[name].push_back(time);
16181632
return;
16191633
}
@@ -1624,6 +1638,7 @@ class vk_perf_logger {
16241638
const ggml_tensor * v = node->src[2];
16251639
const ggml_tensor * m = node->src[3];
16261640
std::stringstream name;
1641+
name << fusion_str;
16271642
name << ggml_op_name(node->op) <<
16281643
" dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " <<
16291644
" q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " <<
@@ -1635,17 +1650,19 @@ class vk_perf_logger {
16351650
}
16361651
if (node->op == GGML_OP_TOP_K) {
16371652
std::stringstream name;
1653+
name << fusion_str;
16381654
name << ggml_op_name(node->op) <<
16391655
" K=" << node->ne[0] <<
16401656
" (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
16411657
timings[name.str()].push_back(time);
16421658
return;
16431659
}
1644-
timings[ggml_op_name(node->op)].push_back(time);
1660+
timings[fusion_str + ggml_op_name(node->op)].push_back(time);
16451661
}
16461662
private:
16471663
std::map<std::string, std::vector<uint64_t>> timings;
16481664
std::map<std::string, std::vector<uint64_t>> flops;
1665+
uint32_t print_count {};
16491666
};
16501667

16511668
struct ggml_backend_vk_context {
@@ -1699,6 +1716,14 @@ struct ggml_backend_vk_context {
16991716
// Bit 'i' means nodes[start_of_fusion + i] writes to memory.
17001717
// If there's no fusion, bit 0 is still set.
17011718
int fused_ops_write_mask {};
1719+
1720+
// for GGML_VK_PERF_LOGGER
1721+
std::unique_ptr<vk_perf_logger> perf_logger;
1722+
vk::QueryPool query_pool;
1723+
std::vector<const char *> query_fusion_names;
1724+
std::vector<ggml_tensor *> query_nodes;
1725+
int32_t num_queries {};
1726+
int32_t query_idx {};
17021727
};
17031728

17041729
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
@@ -1824,8 +1849,6 @@ struct vk_instance_t {
18241849
static bool vk_instance_initialized = false;
18251850
static vk_instance_t vk_instance;
18261851

1827-
static bool vk_perf_logger_enabled = false;
1828-
18291852
#ifdef GGML_VULKAN_CHECK_RESULTS
18301853
static size_t vk_skip_checks;
18311854
static size_t vk_output_tensor;
@@ -4205,9 +4228,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
42054228
#ifdef GGML_VULKAN_MEMORY_DEBUG
42064229
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
42074230
#endif
4208-
if (vk_perf_logger_enabled) {
4209-
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
4210-
}
42114231

42124232
size_t dev_num = vk_instance.device_indices[idx];
42134233

@@ -5153,6 +5173,11 @@ static void ggml_vk_instance_init() {
51535173
}
51545174

51555175
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
5176+
const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
5177+
5178+
if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
5179+
vk_perf_logger_frequency = std::stoul(GGML_VK_PERF_LOGGER_FREQUENCY);
5180+
}
51565181

51575182
// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
51585183
VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
@@ -5330,6 +5355,10 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
53305355
ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
53315356
ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
53325357

5358+
if (vk_perf_logger_enabled) {
5359+
ctx->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
5360+
}
5361+
53335362
#ifdef GGML_VULKAN_CHECK_RESULTS
53345363
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
53355364
vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -12205,6 +12234,9 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
1220512234

1220612235
ctx->compute_cmd_pool.destroy(ctx->device->device);
1220712236
ctx->transfer_cmd_pool.destroy(ctx->device->device);
12237+
if (vk_perf_logger_enabled) {
12238+
ctx->perf_logger->print_timings(true);
12239+
}
1220812240
}
1220912241

1221012242
static int ggml_vk_get_device_count() {
@@ -13003,24 +13035,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1300313035
vk_context compute_ctx;
1300413036
if (vk_perf_logger_enabled) {
1300513037
// allocate/resize the query pool
13006-
if (ctx->device->num_queries < cgraph->n_nodes + 1) {
13007-
if (ctx->device->query_pool) {
13008-
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
13038+
if (ctx->num_queries < cgraph->n_nodes + 1) {
13039+
if (ctx->query_pool) {
13040+
ctx->device->device.destroyQueryPool(ctx->query_pool);
1300913041
}
1301013042
vk::QueryPoolCreateInfo query_create_info;
1301113043
query_create_info.queryType = vk::QueryType::eTimestamp;
1301213044
query_create_info.queryCount = cgraph->n_nodes + 100;
13013-
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
13014-
ctx->device->num_queries = query_create_info.queryCount;
13045+
ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
13046+
ctx->num_queries = query_create_info.queryCount;
13047+
ctx->query_fusion_names.resize(ctx->num_queries);
13048+
ctx->query_nodes.resize(ctx->num_queries);
1301513049
}
1301613050

13017-
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
13051+
ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
13052+
std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
13053+
std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
1301813054

1301913055
GGML_ASSERT(ctx->compute_ctx.expired());
1302013056
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
1302113057
ctx->compute_ctx = compute_ctx;
1302213058
ggml_vk_ctx_begin(ctx->device, compute_ctx);
13023-
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
13059+
ctx->query_idx = 0;
13060+
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
1302413061
}
1302513062

1302613063
ctx->prealloc_y_last_pipeline_used = nullptr;
@@ -13061,52 +13098,66 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1306113098
total_mul_mat_bytes += bytes;
1306213099
}
1306313100

13101+
const char *fusion_string {};
1306413102
if (!ctx->device->disable_fusion) {
1306513103
uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
1306613104
if (num_adds) {
1306713105
ctx->num_additional_fused_ops = num_adds - 1;
13106+
fusion_string = "MULTI_ADD";
1306813107
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) {
1306913108
ctx->num_additional_fused_ops = 2;
13109+
fusion_string = "MUL_MAT_ADD_ADD";
1307013110
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) {
1307113111
ctx->num_additional_fused_ops = 1;
13112+
fusion_string = "MUL_MAT_ADD";
1307213113
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) {
1307313114
ctx->num_additional_fused_ops = 2;
13115+
fusion_string = "MUL_MAT_ID_ADD_ID_MUL";
1307413116
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
1307513117
ctx->num_additional_fused_ops = 1;
13118+
fusion_string = "MUL_MAT_ID_ADD_ID";
1307613119
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) {
1307713120
ctx->num_additional_fused_ops = 1;
13121+
fusion_string = "MUL_MAT_ID_MUL";
1307813122
} else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) &&
1307913123
ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) &&
1308013124
ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) &&
1308113125
ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) {
1308213126
ctx->num_additional_fused_ops = 4;
13127+
fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS";
1308313128
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&&
1308413129
ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) {
1308513130
ctx->num_additional_fused_ops = 2;
13131+
fusion_string = "RMS_NORM_MUL_ROPE";
1308613132
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
1308713133
ctx->num_additional_fused_ops = 1;
13134+
fusion_string = "RMS_NORM_MUL";
1308813135
} else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
1308913136
ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
1309013137
ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
1309113138
ctx->num_additional_fused_ops = 2;
13139+
fusion_string = "ROPE_VIEW_SET_ROWS";
1309213140
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
1309313141
ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
1309413142
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
1309513143
ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
1309613144
// view of argsort writes to memory
1309713145
ctx->fused_ops_write_mask |= 1 << 3;
13146+
fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
1309813147
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
1309913148
ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
1310013149
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
1310113150
ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
1310213151
// view of argsort writes to memory
1310313152
ctx->fused_ops_write_mask |= 1 << 3;
13153+
fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
1310413154
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
1310513155
ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
1310613156
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
1310713157
ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
1310813158
// view of argsort writes to memory
1310913159
ctx->fused_ops_write_mask |= 1 << 1;
13160+
fusion_string = "TOPK_MOE_LATE_SOFTMAX";
1311013161
}
1311113162
}
1311213163
ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
@@ -13120,18 +13171,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1312013171

1312113172
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
1312213173

13123-
if (vk_perf_logger_enabled) {
13174+
if (vk_perf_logger_enabled && enqueued) {
1312413175
if (ctx->compute_ctx.expired()) {
1312513176
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
1312613177
ctx->compute_ctx = compute_ctx;
1312713178
ggml_vk_ctx_begin(ctx->device, compute_ctx);
1312813179
} else {
1312913180
compute_ctx = ctx->compute_ctx.lock();
1313013181
}
13131-
// If there are fused ops, just write out timestamps for all nodes to keep the accounting simple
13132-
for (int j = 0; j < ctx->num_additional_fused_ops + 1; ++j) {
13133-
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+j+1);
13134-
}
13182+
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
13183+
ctx->query_fusion_names[ctx->query_idx] = fusion_string;
13184+
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
1313513185
}
1313613186

1313713187
if (enqueued) {
@@ -13172,14 +13222,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1317213222

1317313223
// Get the results and pass them to the logger
1317413224
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
13175-
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13176-
for (int i = 0; i < cgraph->n_nodes; i++) {
13177-
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
13178-
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
13179-
}
13225+
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13226+
for (int i = 1; i < ctx->query_idx; i++) {
13227+
auto node = ctx->query_nodes[i];
13228+
auto name = ctx->query_fusion_names[i];
13229+
ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
1318013230
}
1318113231

13182-
ctx->device->perf_logger->print_timings();
13232+
ctx->perf_logger->print_timings();
1318313233
}
1318413234

1318513235
if (!ctx->device->support_async) {

0 commit comments

Comments
 (0)