@@ -15,7 +15,7 @@
 #include <iterator>
 #include <algorithm>
 
-float tensor_sum_elements(struct ggml_tensor * tensor) {
+float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
@@ -27,21 +27,13 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
     return sum;
 }
 
-
-/*
-    These are mapping to unknown
-    GGML_TYPE_I8,
-    GGML_TYPE_I16,
-    GGML_TYPE_I32,
-    GGML_TYPE_COUNT,
-*/
-
-#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
-
-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
-        TENSOR->type, TENSOR_TYPE_AS_STR(TENSOR->type), \
-        (int) TENSOR->ne[0], (int) TENSOR->ne[1], (int) TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
-        { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n", #TENSOR, sum); }
+void tensor_dump(const ggml_tensor * tensor) {
+    printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", "tensor",
+        tensor->type, ggml_type_name(tensor->type),
+        (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+    float sum = tensor_sum_elements(tensor);
+    printf("Sum of tensor %s is %6.2f\n", "tensor", sum);
+}
 
 struct benchmark_params_struct {
     int32_t n_threads = 1;
@@ -59,8 +51,6 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para
 }
 
 int main(int argc, char ** argv) {
-
-
     struct benchmark_params_struct benchmark_params;
 
     bool invalid_param = false;
@@ -84,11 +74,11 @@ int main(int argc, char ** argv) {
             print_usage(argc, argv, benchmark_params);
             exit(0);
         }
-        if (invalid_param) {
-            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-            print_usage(argc, argv, benchmark_params);
-            exit(1);
-        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, benchmark_params);
+        exit(1);
     }
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -165,12 +155,12 @@ int main(int argc, char ** argv) {
     gf.n_threads = benchmark_params.n_threads;
     printf("cgraph->n_threads=%i\n", gf.n_threads);
 
-    TENSOR_DUMP(m11);
-    TENSOR_DUMP(m2);
+    tensor_dump(m11);
+    tensor_dump(m2);
 
     ggml_graph_compute(ctx, &gf);
 
-    TENSOR_DUMP(gf.nodes[0]);
+    tensor_dump(gf.nodes[0]);
 
     printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
 
@@ -216,9 +206,8 @@ int main(int argc, char ** argv) {
     // Let's use the F32 result from above as a reference for the q4_0 multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
-
-    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
-    printf("==============================================================================================\n");
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
+    printf("=====================================================================================\n");
 
     for (int i = 0; i < benchmark_params.n_iterations; i++) {
 
@@ -227,15 +216,15 @@ int main(int argc, char ** argv) {
         ggml_graph_compute(ctx, &gf31);
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
-        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
-        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
+        double gflops = (double)(flops_per_matrix)/usec/1000.0;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
             i,
             gf31.n_threads,
             sizex, sizey, sizez, flops_per_matrix,
-            usec, flops_per_usec);
+            usec, gflops);
 
 #ifdef VERBOSE_DEBUGGING
-        TENSOR_DUMP("res", gf31.nodes[0])
+        tensor_dump("res", gf31.nodes[0])
 #endif
 
         // Check that the matrix multiplication result is in the right ballpark
@@ -256,7 +245,5 @@ int main(int argc, char ** argv) {
 
         // Running a different graph computation to make sure we override the CPU cache lines
         ggml_graph_compute(ctx, &gf32);
-
     }
-
 }
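
For reference, the unit conversion behind the new gigaFLOPS column: ggml_time_us() returns microseconds, FLOPs divided by microseconds gives FLOPs per microsecond (i.e. millions of FLOPs per second), and one further division by 1000.0 rescales that to 1e9 FLOPs per second. A minimal standalone sketch of just that conversion, with invented workload numbers purely for illustration (they are not taken from the benchmark):

    #include <stdio.h>

    int main(void) {
        // Invented example workload: one matrix multiply costs roughly 2*M*N*K FLOPs.
        long long int flops_per_matrix = 2LL * 4096 * 11008 * 128; // ~1.15e10 FLOPs
        long long int usec             = 500000;                   // pretend it took 0.5 s

        // FLOPs/usec is FLOPs per microsecond (MFLOPS); /1000.0 rescales to GFLOPS.
        double gflops = (double)(flops_per_matrix)/usec/1000.0;
        printf("%10.2f gigaFLOPS\n", gflops); // prints ~23.09 for these numbers
        return 0;
    }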