
Commit 8dae978

Create tool for synchronization reports

1 parent b749326 commit 8dae978

File tree: 11 files changed, +187 -26 lines

llama.cpp/common.cpp

Lines changed: 1 addition & 0 deletions

@@ -4,6 +4,7 @@
 #include "common.h"
 #include "json.h"
 #include "json-schema-to-grammar.h"
+#include "llamafile/debug.h"
 #include "llama.h"
 
 #include <algorithm>

llama.cpp/ggml.c

Lines changed: 152 additions & 0 deletions

@@ -2,6 +2,7 @@
 // vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
 
 #define GGML_USE_LLAMAFILE 1
+// #define LLAMAFILE_SYNC_REPORT
 
 __notice(ggml_notice, "\
 llama.cpp (MIT License)\n\
@@ -38,6 +39,8 @@ SOFTWARE.");
 #include "ggml-vector.h"
 #include "llamafile/llamafile.h"
 #include "llamafile/log.h"
+#include "llamafile/debug.h"
+#include "llamafile/sgemm.h"
 
 #include <alloca.h>
 #include <assert.h>
@@ -55,6 +58,8 @@ SOFTWARE.");
 #include <signal.h>
 #include <unistd.h>
 #include <cosmo.h>
+#include <threads.h>
+#include <sys/auxv.h>
 #include <libc/log/backtrace.internal.h>
 
 #include <pthread.h>
@@ -1324,6 +1329,35 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
 #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
 #endif
 
+////////////////////////////////////////////////////////////////////////////////
+// SYNCHRONIZATION MEASUREMENTS
+
+int cycles[5000][100];
+_Atomic(unsigned long) opstart[5000];
+_Atomic(unsigned long) absolute_start;
+
+static thread_local struct {
+    unsigned long work_cycles;
+    unsigned long wait_cycles;
+    unsigned long stamp;
+} g_sync;
+
+static void sync_wait_start(void) {
+#ifdef LLAMAFILE_SYNC_REPORT
+    unsigned long now = rdtsc();
+    g_sync.work_cycles += now - g_sync.stamp;
+    g_sync.stamp = now;
+#endif
+}
+
+static void sync_wait_end(void) {
+#ifdef LLAMAFILE_SYNC_REPORT
+    unsigned long now = rdtsc();
+    g_sync.wait_cycles += now - g_sync.stamp;
+    g_sync.stamp = now;
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // SYNCHRONIZATION PRIMITIVES
 
@@ -1360,6 +1394,7 @@ int ggml_delay(int backoff) {
 
 // creates barrier and blocks until all threads call this
 void ggml_syncthreads(struct ggml_barrier * b, int nth) {
+    sync_wait_start();
     unsigned phase = atomic_load_explicit(&b->phase, memory_order_relaxed);
     if (atomic_fetch_add_explicit(&b->count, 1, memory_order_acq_rel) + 1 == nth) {
         atomic_store_explicit(&b->count, 0, memory_order_relaxed);
@@ -1370,6 +1405,7 @@ void ggml_syncthreads(struct ggml_barrier * b, int nth) {
             backoff = ggml_delay(backoff);
         }
     }
+    sync_wait_end();
 }
 
 //
@@ -15862,6 +15898,15 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }
 
+#ifdef LLAMAFILE_SYNC_REPORT
+    long start = rdtsc();
+    unsigned long old = 0;
+    atomic_compare_exchange_strong_explicit(&opstart[llamafile_debug_op_index],
+                                            &old, start,
+                                            memory_order_relaxed,
+                                            memory_order_relaxed);
+#endif
+
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -16203,6 +16248,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
     }
 
+#ifdef LLAMAFILE_SYNC_REPORT
+    long end = rdtsc();
+    cycles[llamafile_debug_op_index][params->ith] += end - start;
+#endif
+
 #ifdef LLAMAFILE_DEBUG
     llamafile_trapping_restore();
 #endif
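Note on the two hunks above: each thread brackets the body of ggml_compute_forward with rdtsc() stamps. The compare-exchange records the first timestamp at which any thread reached the current node in opstart[] (only the winner of the race writes it), and every thread adds its own elapsed cycles for that node to cycles[op][thread]. The op index comes from llamafile_debug_op_index, which the scheduler sets to the active node further down in this diff.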
@@ -17692,6 +17742,7 @@ struct ggml_compute_state_shared {
     const int n_threads;
 
     // synchronization primitives
+    atomic_int n_alive;   // num threads alive
     atomic_int n_active;  // num active threads
     atomic_int node_n;    // active graph node
     atomic_int node_task; // active graph node task phase
@@ -17977,23 +18028,27 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
     const int last_node_n = * node_n;
     int backoff = 0;
 
+    sync_wait_start();
     while (true) {
         * node_n = atomic_load_explicit(&state->shared->node_n, memory_order_acquire);
         if (* node_n != last_node_n) break;
         backoff = ggml_delay(backoff);
     }
+    sync_wait_end();
 }
 
 static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
     // wait for other threads to finish
     const int last_task_phase = * task_phase;
     int backoff = 0;
 
+    sync_wait_start();
     while (true) {
         * task_phase = atomic_load_explicit(&state->shared->node_task, memory_order_acquire);
         if (* task_phase != last_task_phase) break;
         backoff = ggml_delay(backoff);
     }
+    sync_wait_end();
 }
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -18015,10 +18070,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     }
 #endif
 
+#ifdef LLAMAFILE_SYNC_REPORT
+    g_sync.stamp = rdtsc();
+    unsigned long old = 0;
+    atomic_compare_exchange_strong_explicit(&absolute_start,
+                                            &old, g_sync.stamp,
+                                            memory_order_relaxed,
+                                            memory_order_relaxed);
+#endif
+
     while (true) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
             state->shared->node_n += 1;
             state->ec = GGML_STATUS_ABORTED;
+            atomic_fetch_sub_explicit(&state->shared->n_alive, 1, memory_order_release);
             return 0;
         }
 
@@ -18034,6 +18099,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             /*.barrier =*/ &state->shared->barrier,
         };
 
+#ifdef LLAMAFILE_DEBUG
+        llamafile_debug_op_index = node_n;
+#endif
+
         if (node_n != -1) {
             /* FINALIZE */
             struct ggml_tensor * node = cgraph->nodes[node_n];
@@ -18217,6 +18286,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     }
 
+#ifdef LLAMAFILE_SYNC_REPORT
+    g_sync.work_cycles += rdtsc() - g_sync.stamp;
+    double total = g_sync.work_cycles + g_sync.wait_cycles;
+    int workpercent = g_sync.work_cycles / total * 100.5;
+    fprintf(stderr, "SYNC %03d %3d%% working\n", state->ith, workpercent);
+#endif
+
+    atomic_fetch_sub_explicit(&state->shared->n_alive, 1, memory_order_release);
     return 0;
 }
 
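The SYNC line each worker prints above boils the whole run down to one ratio: cycles spent inside ggml_compute_forward versus cycles spent spinning in barriers. The following is a minimal, self-contained sketch of that accounting model; it is not part of the commit, the names (sync_clock, now_ns, the loop counts) are illustrative, and clock_gettime() stands in for rdtsc() so it compiles on any POSIX system.

/*
 * Sketch of the work/wait accounting model (not from the commit).
 * Each thread keeps one running timestamp and charges the time since
 * the last stamp either to "work" or to "wait".
 */
#include <stdio.h>
#include <time.h>

struct sync_clock {
    unsigned long work;   // time spent computing
    unsigned long wait;   // time spent spinning in barriers
    unsigned long stamp;  // time of the last accounting event
};

static unsigned long now_ns(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned long)ts.tv_sec * 1000000000ul + (unsigned long)ts.tv_nsec;
}

// entering a barrier: everything since the last stamp was work
static void wait_start(struct sync_clock *c) {
    unsigned long now = now_ns();
    c->work += now - c->stamp;
    c->stamp = now;
}

// leaving a barrier: everything since the last stamp was waiting
static void wait_end(struct sync_clock *c) {
    unsigned long now = now_ns();
    c->wait += now - c->stamp;
    c->stamp = now;
}

int main(void) {
    struct sync_clock c = {0, 0, now_ns()};
    for (volatile long i = 0; i < 50000000; ++i) {}  // pretend to compute
    wait_start(&c);
    for (volatile long i = 0; i < 5000000; ++i) {}   // pretend to spin in a barrier
    wait_end(&c);
    c.work += now_ns() - c.stamp;                    // close the final work interval
    double total = c.work + c.wait;
    printf("SYNC %3d%% working\n", (int)(c.work / total * 100.5));
    return 0;
}

For example, a worker that spends 9 of every 10 cycles computing prints roughly "90% working"; one such line per thread makes load imbalance and barrier overhead easy to spot.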
@@ -18432,6 +18509,45 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     return cplan;
 }
 
+static const char *describe_shape(char buf[100], struct ggml_tensor *t) {
+    char *p = buf;
+    *p++ = '[';
+    for (int i = 0; i < GGML_MAX_DIMS && t->ne[i] != 1; ++i) {
+        if (i) p += sprintf(p, ", ");
+        p += sprintf(p, "%d", t->ne[i]);
+    }
+    *p++ = ']';
+    *p = 0;
+    return buf;
+}
+
+static const char *describe_vertex(struct ggml_tensor *t) {
+    if (t->op == GGML_OP_UNARY)
+        return ggml_unary_op_name((enum ggml_unary_op)t->op_params[0]);
+    if (t->op)
+        return ggml_op_name(t->op);
+    return t->name;
+}
+
+static void print_graph(FILE *f, const struct ggml_cgraph *g, int n_threads) {
+    char shape[100];
+    for (int i = 0; i < g->n_nodes; ++i) {
+        int n_tasks = ggml_get_n_tasks(g->nodes[i], 8, 4);
+        fprintf(f, "%5d %p %s:%s%s(", i, g->nodes[i], ggml_type_name(g->nodes[i]->type),
+                describe_vertex(g->nodes[i]),
+                describe_shape(shape, g->nodes[i]));
+        for (int j = 0; j < GGML_MAX_SRC && g->nodes[i]->src[j]; ++j) {
+            if (j)
+                fprintf(f, ", ");
+            fprintf(f, "%s:%s%s[%p]", ggml_type_name(g->nodes[i]->src[j]->type),
+                    describe_vertex(g->nodes[i]->src[j]),
+                    describe_shape(shape, g->nodes[i]->src[j]),
+                    g->nodes[i]->src[j]);
+        }
+        fprintf(f, ") n_tasks=%d\n", n_tasks);
+    }
+}
+
 enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
         GGML_ASSERT(cplan);
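print_graph emits one line per graph node: its index and address, then type:op[shape], followed by each source tensor in the same type:op[shape][address] notation and the n_tasks the planner would assign it. The dump is written to /tmp/cgraph.txt in the hunk below, so the per-op cycle report can be read side by side with the graph structure.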
@@ -18442,6 +18558,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
+#ifdef LLAMAFILE_SYNC_REPORT
+    fprintf(stderr, "\n");
+#endif
+
     const int n_threads = cplan->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
@@ -18450,6 +18570,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.perf_node_start_cycles  =*/ 0,
         /*.perf_node_start_time_us =*/ 0,
         /*.n_threads               =*/ n_threads,
+        /*.n_alive                 =*/ n_threads,
         /*.n_active                =*/ n_threads,
         /*.node_n                  =*/ -1,
         /*.node_task               =*/ GGML_TASK_TYPE_FINALIZE,
@@ -18503,6 +18624,37 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
+#ifdef LLAMAFILE_SYNC_REPORT
+    opstart[cgraph->n_nodes] = rdtsc();
+    static int lol;
+    if (++lol == 2) {
+        const char *path = "/tmp/cgraph.txt";
+        FILE *f = fopen(path, "w");
+        if (!f) {
+            const char *help;
+            if (errno == EPERM) {
+                help = " (you need to pass the --unsecure flag)";
+            } else {
+                help = "";
+            }
+            fprintf(stderr, "%s: %s%s\n", path, strerror(errno), help);
+            exit(1);
+        }
+        print_graph(f, cgraph, cplan->n_threads);
+        fclose(f);
+        fprintf(stderr, "\n");
+    }
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        fprintf(stderr, "OP %-11s %12ld %4d", describe_vertex(cgraph->nodes[i]), opstart[i + 1] - opstart[i], i);
+        for (int j = 0; j < n_threads; ++j) {
+            fprintf(stderr, " %10d", cycles[i][j]);
+        }
+        fprintf(stderr, "\n");
+    }
+    memset(opstart, 0, sizeof(opstart));
+    memset(cycles, 0, sizeof(cycles));
+#endif
+
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
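Usage note (derived from the diff above): to produce a report with this tool, uncomment the // #define LLAMAFILE_SYNC_REPORT line near the top of llama.cpp/ggml.c and rebuild. Every graph evaluation then prints a per-op table to stderr (operator name, the cycle delta between the first thread reaching that node and the next node, and one cycle column per thread), each worker prints its SYNC ... % working line as it returns, and the second evaluation also writes the node listing to /tmp/cgraph.txt; if the sandbox refuses that write, the error message suggests rerunning with --unsecure.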

llama.cpp/server/server.cpp

Lines changed: 2 additions & 0 deletions

@@ -1,5 +1,6 @@
 // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
 // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+
 #include "llama.cpp/common.h"
 #include "llama.cpp/llama.h"
 #include "llama.cpp/grammar-parser.h"
@@ -9,6 +10,7 @@
 #include "oai.h"
 #include "llamafile/micros.h"
 #include "llamafile/llamafile.h"
+#include "llamafile/debug.h"
 #include "macsandbox.h"
 
 // increase max payload length to allow use of larger context size

llamafile/debug.cpp

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "llamafile.h"
+#include "debug.h"
 #include "log.h"
 
 #include <cosmo.h>

llamafile/debug.h

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+#ifndef LLAMAFILE_DEBUG_H_
+#define LLAMAFILE_DEBUG_H_
+#include <threads.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct StackFrame;
+struct ggml_cgraph;
+int llamafile_trapping_enabled(int);
+void llamafile_trapping_restore(void);
+void ShowBacktrace(int, const struct StackFrame *);
+extern const struct ggml_cgraph *llamafile_debug_graph;
+extern thread_local int llamafile_debug_op_index;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* LLAMAFILE_DEBUG_H_ */

llamafile/llamafile.h

Lines changed: 0 additions & 21 deletions

@@ -2,7 +2,6 @@
 #define LLAMAFILE_H_
 #include <stdbool.h>
 #include <stdio.h>
-#include <threads.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -51,26 +50,6 @@ bool llamafile_has_amd_gpu(void);
 int llamafile_gpu_parse(const char *);
 const char *llamafile_describe_gpu(void);
 
-bool llamafile_sgemm(long, long, long, const void *, long, const void *, long, void *, long, int,
-                     int, int, int, int, int, int);
-
-struct ggml_tensor;
-struct ggml_compute_params;
-bool llamafile_mixmul(const struct ggml_compute_params *, const struct ggml_tensor *,
-                      const struct ggml_tensor *, const struct ggml_tensor *, struct ggml_tensor *);
-size_t llamafile_mixmul_needs(const struct ggml_tensor *, const struct ggml_tensor *,
-                              const struct ggml_tensor *);
-
-struct StackFrame;
-struct ggml_cgraph;
-int feenableexcept(int);
-int fedisableexcept(int);
-int llamafile_trapping_enabled(int);
-void llamafile_trapping_restore(void);
-void ShowBacktrace(int, const struct StackFrame *);
-extern const struct ggml_cgraph *llamafile_debug_graph;
-extern thread_local int llamafile_debug_op_index;
-
 #ifdef __cplusplus
 }
 #endif

llamafile/sgemm.h

Lines changed: 7 additions & 0 deletions

@@ -7,6 +7,13 @@ extern "C" {
 struct ggml_tensor;
 struct ggml_compute_params;
 
+bool llamafile_sgemm(long, long, long, const void *, long, const void *, long, void *, long, int,
+                     int, int, int, int, int, int);
+bool llamafile_mixmul(const struct ggml_compute_params *, const struct ggml_tensor *,
+                      const struct ggml_tensor *, const struct ggml_tensor *, struct ggml_tensor *);
+size_t llamafile_mixmul_needs(const struct ggml_tensor *, const struct ggml_tensor *,
+                              const struct ggml_tensor *);
+
 bool llamafile_sgemm_unsupported(long, long, long, const void *, long, const void *, long, void *,
                                  long, int, int, int, int, int, int, int);
 bool llamafile_sgemm_amd_avx(long, long, long, const void *, long, const void *, long, void *, long,
