22// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
33
44#define GGML_USE_LLAMAFILE 1
5+ // #define LLAMAFILE_SYNC_REPORT
56
67__notice(ggml_notice, "\
78llama.cpp (MIT License)\n\
@@ -38,6 +39,8 @@ SOFTWARE.");
3839#include "ggml-vector.h"
3940#include "llamafile/llamafile.h"
4041#include "llamafile/log.h"
42+ #include "llamafile/debug.h"
43+ #include "llamafile/sgemm.h"
4144
4245#include <alloca.h>
4346#include <assert.h>
@@ -55,6 +58,8 @@ SOFTWARE.");
5558#include <signal.h>
5659#include <unistd.h>
5760#include <cosmo.h>
61+ #include <threads.h>
62+ #include <sys/auxv.h>
5863#include <libc/log/backtrace.internal.h>
5964
6065#include <pthread.h>
@@ -1324,6 +1329,35 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
13241329#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
13251330#endif
13261331
1332+ ////////////////////////////////////////////////////////////////////////////////
1333+ // SYNCHRONIZATION MEASUREMENTS
1334+
// Per-op, per-thread cycle counters for the LLAMAFILE_SYNC_REPORT
// instrumentation. Sized for up to 5000 graph nodes and 100 threads.
// NOTE(review): `int` may overflow when accumulating rdtsc deltas over
// very long ops — likely acceptable for a debug report, but confirm.
int cycles[5000][100];
// First-touch rdtsc timestamp per graph op (0 means "not started yet").
_Atomic(unsigned long) opstart[5000];
// rdtsc timestamp of the first worker thread to begin graph computation.
_Atomic(unsigned long) absolute_start;

// Per-thread accounting of cycles spent working vs. waiting at barriers.
static thread_local struct {
    unsigned long work_cycles; // cycles spent doing useful work
    unsigned long wait_cycles; // cycles spent spinning in sync waits
    unsigned long stamp;       // rdtsc value at the last state transition
} g_sync;
1344+
// Marks the transition from "working" to "waiting": charges the cycles
// elapsed since the last stamp to this thread's work bucket and restamps.
// Compiles to a no-op unless LLAMAFILE_SYNC_REPORT is defined.
static void sync_wait_start(void) {
#ifdef LLAMAFILE_SYNC_REPORT
    unsigned long prev = g_sync.stamp;
    unsigned long t = rdtsc();
    g_sync.stamp = t;
    g_sync.work_cycles += t - prev;
#endif
}
1352+
// Marks the transition from "waiting" back to "working": charges the cycles
// elapsed since the last stamp to this thread's wait bucket and restamps.
// Compiles to a no-op unless LLAMAFILE_SYNC_REPORT is defined.
static void sync_wait_end(void) {
#ifdef LLAMAFILE_SYNC_REPORT
    unsigned long prev = g_sync.stamp;
    unsigned long t = rdtsc();
    g_sync.stamp = t;
    g_sync.wait_cycles += t - prev;
#endif
}
1360+
13271361////////////////////////////////////////////////////////////////////////////////
13281362// SYNCHRONIZATION PRIMITIVES
13291363
@@ -1360,6 +1394,7 @@ int ggml_delay(int backoff) {
13601394
13611395// creates barrier and blocks until all threads call this
13621396void ggml_syncthreads(struct ggml_barrier * b, int nth) {
1397+ sync_wait_start();
13631398 unsigned phase = atomic_load_explicit(&b->phase, memory_order_relaxed);
13641399 if (atomic_fetch_add_explicit(&b->count, 1, memory_order_acq_rel) + 1 == nth) {
13651400 atomic_store_explicit(&b->count, 0, memory_order_relaxed);
@@ -1370,6 +1405,7 @@ void ggml_syncthreads(struct ggml_barrier * b, int nth) {
13701405 backoff = ggml_delay(backoff);
13711406 }
13721407 }
1408+ sync_wait_end();
13731409}
13741410
13751411//
@@ -15862,6 +15898,15 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
1586215898 return;
1586315899 }
1586415900
15901+ #ifdef LLAMAFILE_SYNC_REPORT
15902+ long start = rdtsc();
15903+ unsigned long old = 0;
15904+ atomic_compare_exchange_strong_explicit(&opstart[llamafile_debug_op_index],
15905+ &old, start,
15906+ memory_order_relaxed,
15907+ memory_order_relaxed);
15908+ #endif
15909+
1586515910 switch (tensor->op) {
1586615911 case GGML_OP_DUP:
1586715912 {
@@ -16203,6 +16248,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
1620316248 } break;
1620416249 }
1620516250
16251+ #ifdef LLAMAFILE_SYNC_REPORT
16252+ long end = rdtsc();
16253+ cycles[llamafile_debug_op_index][params->ith] += end - start;
16254+ #endif
16255+
1620616256#ifdef LLAMAFILE_DEBUG
1620716257 llamafile_trapping_restore();
1620816258#endif
@@ -17692,6 +17742,7 @@ struct ggml_compute_state_shared {
1769217742 const int n_threads;
1769317743
1769417744 // synchronization primitives
17745+ atomic_int n_alive; // num threads alive
1769517746 atomic_int n_active; // num active threads
1769617747 atomic_int node_n; // active graph node
1769717748 atomic_int node_task; // active graph node task phase
@@ -17977,23 +18028,27 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
1797718028 const int last_node_n = * node_n;
1797818029 int backoff = 0;
1797918030
18031+ sync_wait_start();
1798018032 while (true) {
1798118033 * node_n = atomic_load_explicit(&state->shared->node_n, memory_order_acquire);
1798218034 if (* node_n != last_node_n) break;
1798318035 backoff = ggml_delay(backoff);
1798418036 }
18037+ sync_wait_end();
1798518038}
1798618039
1798718040static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
1798818041 // wait for other threads to finish
1798918042 const int last_task_phase = * task_phase;
1799018043 int backoff = 0;
1799118044
18045+ sync_wait_start();
1799218046 while (true) {
1799318047 * task_phase = atomic_load_explicit(&state->shared->node_task, memory_order_acquire);
1799418048 if (* task_phase != last_task_phase) break;
1799518049 backoff = ggml_delay(backoff);
1799618050 }
18051+ sync_wait_end();
1799718052}
1799818053
1799918054static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -18015,10 +18070,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1801518070 }
1801618071#endif
1801718072
18073+ #ifdef LLAMAFILE_SYNC_REPORT
18074+ g_sync.stamp = rdtsc();
18075+ unsigned long old = 0;
18076+ atomic_compare_exchange_strong_explicit(&absolute_start,
18077+ &old, g_sync.stamp,
18078+ memory_order_relaxed,
18079+ memory_order_relaxed);
18080+ #endif
18081+
1801818082 while (true) {
1801918083 if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
1802018084 state->shared->node_n += 1;
1802118085 state->ec = GGML_STATUS_ABORTED;
18086+ atomic_fetch_sub_explicit(&state->shared->n_alive, 1, memory_order_release);
1802218087 return 0;
1802318088 }
1802418089
@@ -18034,6 +18099,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1803418099 /*.barrier =*/ &state->shared->barrier,
1803518100 };
1803618101
18102+ #ifdef LLAMAFILE_DEBUG
18103+ llamafile_debug_op_index = node_n;
18104+ #endif
18105+
1803718106 if (node_n != -1) {
1803818107 /* FINALIZE */
1803918108 struct ggml_tensor * node = cgraph->nodes[node_n];
@@ -18217,6 +18286,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1821718286
1821818287 }
1821918288
18289+ #ifdef LLAMAFILE_SYNC_REPORT
18290+ g_sync.work_cycles += rdtsc() - g_sync.stamp;
18291+ double total = g_sync.work_cycles + g_sync.wait_cycles;
18292+ int workpercent = g_sync.work_cycles / total * 100.5;
18293+ fprintf(stderr, "SYNC %03d %3d%% working\n", state->ith, workpercent);
18294+ #endif
18295+
18296+ atomic_fetch_sub_explicit(&state->shared->n_alive, 1, memory_order_release);
1822018297 return 0;
1822118298}
1822218299
@@ -18432,6 +18509,45 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
1843218509 return cplan;
1843318510}
1843418511
18512+ static const char *describe_shape(char buf[100], struct ggml_tensor *t) {
18513+ char *p = buf;
18514+ *p++ = '[';
18515+ for (int i = 0; i < GGML_MAX_DIMS && t->ne[i] != 1; ++i) {
18516+ if (i) p += sprintf(p, ", ");
18517+ p += sprintf(p, "%d", t->ne[i]);
18518+ }
18519+ *p++ = ']';
18520+ *p = 0;
18521+ return buf;
18522+ }
18523+
18524+ static const char *describe_vertex(struct ggml_tensor *t) {
18525+ if (t->op == GGML_OP_UNARY)
18526+ return ggml_unary_op_name((enum ggml_unary_op)t->op_params[0]);
18527+ if (t->op)
18528+ return ggml_op_name(t->op);
18529+ return t->name;
18530+ }
18531+
18532+ static void print_graph(FILE *f, const struct ggml_cgraph *g, int n_threads) {
18533+ char shape[100];
18534+ for (int i = 0; i < g->n_nodes; ++i) {
18535+ int n_tasks = ggml_get_n_tasks(g->nodes[i], 8, 4);
18536+ fprintf(f, "%5d %p %s:%s%s(", i, g->nodes[i], ggml_type_name(g->nodes[i]->type),
18537+ describe_vertex(g->nodes[i]),
18538+ describe_shape(shape, g->nodes[i]));
18539+ for (int j = 0; j < GGML_MAX_SRC && g->nodes[i]->src[j]; ++j) {
18540+ if (j)
18541+ fprintf(f, ", ");
18542+ fprintf(f, "%s:%s%s[%p]", ggml_type_name(g->nodes[i]->src[j]->type),
18543+ describe_vertex(g->nodes[i]->src[j]),
18544+ describe_shape(shape, g->nodes[i]->src[j]),
18545+ g->nodes[i]->src[j]);
18546+ }
18547+ fprintf(f, ") n_tasks=%d\n", n_tasks);
18548+ }
18549+ }
18550+
1843518551enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
1843618552 {
1843718553 GGML_ASSERT(cplan);
@@ -18442,6 +18558,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1844218558 }
1844318559 }
1844418560
18561+ #ifdef LLAMAFILE_SYNC_REPORT
18562+ fprintf(stderr, "\n");
18563+ #endif
18564+
1844518565 const int n_threads = cplan->n_threads;
1844618566
1844718567 struct ggml_compute_state_shared state_shared = {
@@ -18450,6 +18570,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1845018570 /*.perf_node_start_cycles =*/ 0,
1845118571 /*.perf_node_start_time_us =*/ 0,
1845218572 /*.n_threads =*/ n_threads,
18573+ /*.n_alive =*/ n_threads,
1845318574 /*.n_active =*/ n_threads,
1845418575 /*.node_n =*/ -1,
1845518576 /*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
@@ -18503,6 +18624,37 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1850318624 }
1850418625 }
1850518626
18627+ #ifdef LLAMAFILE_SYNC_REPORT
18628+ opstart[cgraph->n_nodes] = rdtsc();
18629+ static int lol;
18630+ if (++lol == 2) {
18631+ const char *path = "/tmp/cgraph.txt";
18632+ FILE *f = fopen(path, "w");
18633+ if (!f) {
18634+ const char *help;
18635+ if (errno == EPERM) {
18636+ help = " (you need to pass the --unsecure flag)";
18637+ } else {
18638+ help = "";
18639+ }
18640+ fprintf(stderr, "%s: %s%s\n", path, strerror(errno), help);
18641+ exit(1);
18642+ }
18643+ print_graph(f, cgraph, cplan->n_threads);
18644+ fclose(f);
18645+ fprintf(stderr, "\n");
18646+ }
18647+ for (int i = 0; i < cgraph->n_nodes; ++i) {
18648+ fprintf(stderr, "OP %-11s %12ld %4d", describe_vertex(cgraph->nodes[i]), opstart[i + 1] - opstart[i], i);
18649+ for (int j = 0; j < n_threads; ++j) {
18650+ fprintf(stderr, " %10d", cycles[i][j]);
18651+ }
18652+ fprintf(stderr, "\n");
18653+ }
18654+ memset(opstart, 0, sizeof(opstart));
18655+ memset(cycles, 0, sizeof(cycles));
18656+ #endif
18657+
1850618658 // performance stats (graph)
1850718659 {
1850818660 int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
0 commit comments