-
Notifications
You must be signed in to change notification settings - Fork 1
/
LLAMA2_0.bas
495 lines (432 loc) · 21.2 KB
/
LLAMA2_0.bas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
/'
INFERENCE FOR LLAMA-2 TRANSFORMER MODEL IN PURE FREEBASIC
EXAMPLE COMPILE:
    fbc LLAMA2_0.bas
EXAMPLE RUN:
    LLAMA2_0 < CHECKPOINT_FILE > [ TEMPERATURE ] [ STEPS ]
'/
#include "crt/stdio.bi"
#include "crt/stdlib.bi"
#include "crt/time.bi"
#include "crt/math.bi"
#include "crt/string.bi"
'-------------------------------------------------------------------------------------------------
'GLOBAL MODEL STATE: CONFIG HEADER FIELDS, WEIGHT POINTERS AND RUN-STATE BUFFERS.
'THE CONFIG_* VALUES ARE FILLED FROM THE FIRST SEVEN 4-BYTE FIELDS OF THE CHECKPOINT FILE,
'IN THE ORDER NUMBERED BELOW. SHAPES IN THE COMMENTS ARE ROW-MAJOR ELEMENT COUNTS.
DIM SHARED AS INTEGER CONFIG_DIM_4B'1: LENGTH OF ONE EMBEDDING ROW AND OF THE X/XB/Q/K/V VECTORS
DIM SHARED AS INTEGER CONFIG_HIDDEN_DIM_4B'2: LENGTH OF THE HB/HB2 FFN VECTORS
DIM SHARED AS INTEGER CONFIG_N_LAYERS_4B'3: NUMBER OF TRANSFORMER LAYERS
DIM SHARED AS INTEGER CONFIG_N_HEADS_4B'4: NUMBER OF ATTENTION HEADS
DIM SHARED AS INTEGER CONFIG_N_KV_HEADS_4B'5: READ FROM THE HEADER BUT NOT USED IN THE VISIBLE CODE
DIM SHARED AS INTEGER CONFIG_VOCAB_SIZE_4B'6: NUMBER OF TOKENS (THE FILE STORES IT NEGATED FOR UNSHARED CLASSIFIER WEIGHTS)
DIM SHARED AS INTEGER CONFIG_SEQ_LEN_4B'7: MAXIMUM NUMBER OF POSITIONS
'TOKEN EMBEDDING TABLE
DIM SHARED AS SINGLE PTR TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B '(VOCAB_SIZE,DIM) 1
'WEIGHTS FOR RMSNORMS
DIM SHARED AS SINGLE PTR TransformerWeights_RMS_ATT_WEIGHT_f4B '(N_LAYERS,DIM) 2
DIM SHARED AS SINGLE PTR TransformerWeights_RMS_FFN_WEIGHT_f4B '(N_LAYERS,DIM) 3
'WEIGHTS FOR THE ATTENTION MATMULS
DIM SHARED AS SINGLE PTR TransformerWeights_WQ_f4B '(N_LAYERS,DIM,DIM) 4
DIM SHARED AS SINGLE PTR TransformerWeights_WK_f4B '(N_LAYERS,DIM,DIM) 5
DIM SHARED AS SINGLE PTR TransformerWeights_WV_f4B '(N_LAYERS,DIM,DIM) 6
DIM SHARED AS SINGLE PTR TransformerWeights_WO_f4B '(N_LAYERS,DIM,DIM) 7
'WEIGHTS FOR THE FFN
DIM SHARED AS SINGLE PTR TransformerWeights_W1_f4B '(N_LAYERS,HIDDEN_DIM,DIM) 8
DIM SHARED AS SINGLE PTR TransformerWeights_W2_f4B '(N_LAYERS,DIM,HIDDEN_DIM) 9
DIM SHARED AS SINGLE PTR TransformerWeights_W3_f4B '(N_LAYERS,HIDDEN_DIM,DIM) 10
'FINAL RMSNORM
DIM SHARED AS SINGLE PTR TransformerWeights_RMS_FINAL_WEIGHT_f4B '(DIM) 11
'FREQ_CIS FOR ROPE RELATIVE POSITIONAL EMBEDDINGS
DIM SHARED AS SINGLE PTR TransformerWeights_FREQ_CIS_REAL_f4B '(SEQ_LEN,HEAD_SIZE/2) 12
DIM SHARED AS SINGLE PTR TransformerWeights_FREQ_CIS_IMAG_f4B '(SEQ_LEN,HEAD_SIZE/2) 13
'CLASSIFIER WEIGHTS FOR THE LOGITS; ALIASES THE TOKEN EMBEDDING TABLE WHEN THE WEIGHTS ARE SHARED
DIM SHARED AS SINGLE PTR TransformerWeights_WCLS_f4B '(VOCAB_SIZE,DIM) 14
'RUN-STATE BUFFERS, ALLOCATED BY SUB_MALLOC_RUN_STATE AND RELEASED BY SUB_FREE_RUN_STATE
DIM SHARED AS SINGLE PTR RunState_X_f4B '(DIM) 1
DIM SHARED AS SINGLE PTR RunState_XB_f4B '(DIM) 2
DIM SHARED AS SINGLE PTR RunState_XB2_f4B '(DIM) 3
DIM SHARED AS SINGLE PTR RunState_HB_f4B '(HIDDEN_DIM) 4
DIM SHARED AS SINGLE PTR RunState_HB2_f4B '(HIDDEN_DIM) 5
DIM SHARED AS SINGLE PTR RunState_Q_f4B '(DIM) 6
DIM SHARED AS SINGLE PTR RunState_K_f4B '(DIM) 7
DIM SHARED AS SINGLE PTR RunState_V_f4B '(DIM) 8
DIM SHARED AS SINGLE PTR RunState_ATT_f4B '(N_HEADS,SEQ_LEN) 9
DIM SHARED AS SINGLE PTR RunState_LOGITS_f4B '(VOCAB_SIZE) 10
'KV CACHE
DIM SHARED AS SINGLE PTR RunState_KEY_CACHE_f4B '(N_LAYERS,SEQ_LEN,DIM) 11
DIM SHARED AS SINGLE PTR RunState_VALUE_CACHE_f4B '(N_LAYERS,SEQ_LEN,DIM) 12
'ALLOCATE EVERY RunState_* WORKING BUFFER, ZERO-FILLED.
'BUFFER SIZES ARE DERIVED FROM THE CONFIG_* GLOBALS, SO THOSE MUST BE SET BEFORE THE CALL.
'IF ANY ALLOCATION FAILS A MESSAGE IS PRINTED AND THE PROGRAM ENDS WITH EXIT CODE 1,
'BECAUSE NO CALLER CAN CONTINUE WITHOUT THESE BUFFERS.
SUB SUB_MALLOC_RUN_STATE()
    'ELEMENT COUNT OF ONE KV CACHE: (N_LAYERS,SEQ_LEN,DIM)
    DIM AS INTEGER KV_COUNT_4B=CONFIG_N_LAYERS_4B*CONFIG_SEQ_LEN_4B*CONFIG_DIM_4B

    'CALLOCATE INSTEAD OF ALLOCATE SO THAT EVERY BUFFER STARTS ZEROED (KEEPS VALGRIND HAPPY)
    RunState_X_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_XB_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_XB2_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_HB_f4B=CALLOCATE(CONFIG_HIDDEN_DIM_4B,SIZEOF(SINGLE))
    RunState_HB2_f4B=CALLOCATE(CONFIG_HIDDEN_DIM_4B,SIZEOF(SINGLE))
    RunState_Q_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_K_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_V_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_ATT_f4B=CALLOCATE(CONFIG_N_HEADS_4B*CONFIG_SEQ_LEN_4B,SIZEOF(SINGLE))
    RunState_LOGITS_f4B=CALLOCATE(CONFIG_VOCAB_SIZE_4B,SIZEOF(SINGLE))
    RunState_KEY_CACHE_f4B=CALLOCATE(KV_COUNT_4B,SIZEOF(SINGLE))
    RunState_VALUE_CACHE_f4B=CALLOCATE(KV_COUNT_4B,SIZEOF(SINGLE))

    'A NULL POINTER MEANS THE ALLOCATION FAILED; STOP BEFORE ANYTHING DEREFERENCES IT
    IF RunState_X_f4B=0 OR RunState_XB_f4B=0 OR RunState_XB2_f4B=0 _
        OR RunState_HB_f4B=0 OR RunState_HB2_f4B=0 OR RunState_Q_f4B=0 _
        OR RunState_K_f4B=0 OR RunState_V_f4B=0 OR RunState_ATT_f4B=0 _
        OR RunState_LOGITS_f4B=0 OR RunState_KEY_CACHE_f4B=0 OR RunState_VALUE_CACHE_f4B=0 THEN
        ?"MEMORY ALLOCATION FAILED!"
        END 1
    ENDIF
END SUB
'RELEASE EVERY RunState_* BUFFER.
'EACH POINTER IS RESET TO 0 AFTER IT IS FREED, SO A SECOND CALL IS HARMLESS
'(DEALLOCATE OF A NULL POINTER DOES NOTHING) AND NO DANGLING POINTER IS LEFT BEHIND.
SUB SUB_FREE_RUN_STATE()
    DEALLOCATE(RunState_X_f4B):RunState_X_f4B=0'1
    DEALLOCATE(RunState_XB_f4B):RunState_XB_f4B=0'2
    DEALLOCATE(RunState_XB2_f4B):RunState_XB2_f4B=0'3
    DEALLOCATE(RunState_HB_f4B):RunState_HB_f4B=0'4
    DEALLOCATE(RunState_HB2_f4B):RunState_HB2_f4B=0'5
    DEALLOCATE(RunState_Q_f4B):RunState_Q_f4B=0'6
    DEALLOCATE(RunState_K_f4B):RunState_K_f4B=0'7
    DEALLOCATE(RunState_V_f4B):RunState_V_f4B=0'8
    DEALLOCATE(RunState_ATT_f4B):RunState_ATT_f4B=0'9
    DEALLOCATE(RunState_LOGITS_f4B):RunState_LOGITS_f4B=0'10
    DEALLOCATE(RunState_KEY_CACHE_f4B):RunState_KEY_CACHE_f4B=0'11
    DEALLOCATE(RunState_VALUE_CACHE_f4B):RunState_VALUE_CACHE_f4B=0'12
END SUB
'---------------------------------------------------
'INITIALIZATION READ THE CHECKPOINT
'POINT EVERY TransformerWeights_* GLOBAL INTO ONE CONTIGUOUS BLOCK OF FLOATS.
'  F_f4B             : FIRST WEIGHT VALUE (THE CALLER HAS ALREADY SKIPPED THE CONFIG HEADER)
'  SHARED_WEIGHTS_4B : 1 = THE CLASSIFIER REUSES THE TOKEN EMBEDDING TABLE,
'                      ANYTHING ELSE = THE CLASSIFIER WEIGHTS FOLLOW THE FREQ_CIS TABLES
'THE CONFIG_* GLOBALS MUST BE SET BEFORE THE CALL; NO MEMORY IS COPIED OR ALLOCATED HERE.
'ALL OFFSETS ARE ELEMENT COUNTS AND USE INTEGER DIVISION (\); THE ORIGINAL FLOATING-POINT
'DIVISION (/) WAS ROUNDED WHEN CONVERTED BACK TO AN INTEGER.
SUB SUB_CHECKPOINT_INIT_WEIGHTS(F_f4B AS SINGLE PTR,SHARED_WEIGHTS_4B AS INTEGER)':425:
    DIM AS SINGLE PTR PTR_f4B
    DIM AS INTEGER LAYER_DIM_4B=CONFIG_N_LAYERS_4B*CONFIG_DIM_4B                 'ONE RMS WEIGHT SET
    DIM AS INTEGER LAYER_DIM_DIM_4B=LAYER_DIM_4B*CONFIG_DIM_4B                   'ONE ATTENTION MATRIX SET
    DIM AS INTEGER LAYER_DIM_HIDDEN_4B=LAYER_DIM_4B*CONFIG_HIDDEN_DIM_4B         'ONE FFN MATRIX SET
    DIM AS INTEGER HEAD_SIZE_4B=0
    DIM AS INTEGER FREQ_COUNT_4B

    'DIAGNOSTIC TRACE OF THE FIRST WEIGHT VALUE
    ?" :116:PRE "
    PTR_f4B=F_f4B
    ?":116: F[0]=";F_f4B[0];" PTR_f4B=";PTR_f4B[0]

    TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B=PTR_f4B
    ?":117:TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B[0]=";TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B[0]
    PTR_f4B+=CONFIG_VOCAB_SIZE_4B*CONFIG_DIM_4B

    TransformerWeights_RMS_ATT_WEIGHT_f4B=PTR_f4B
    PTR_f4B+=LAYER_DIM_4B

    TransformerWeights_WQ_f4B=PTR_f4B
    PTR_f4B+=LAYER_DIM_DIM_4B
    TransformerWeights_WK_f4B=PTR_f4B
    PTR_f4B+=LAYER_DIM_DIM_4B
    TransformerWeights_WV_f4B=PTR_f4B
    PTR_f4B+=LAYER_DIM_DIM_4B
    TransformerWeights_WO_f4B=PTR_f4B
    PTR_f4B+=LAYER_DIM_DIM_4B

    TransformerWeights_RMS_FFN_WEIGHT_f4B=PTR_f4B
    PTR_f4B+=LAYER_DIM_4B

    TransformerWeights_W1_f4B=PTR_f4B
    PTR_f4B+=LAYER_DIM_HIDDEN_4B
    TransformerWeights_W2_f4B=PTR_f4B
    PTR_f4B+=LAYER_DIM_HIDDEN_4B
    TransformerWeights_W3_f4B=PTR_f4B
    PTR_f4B+=LAYER_DIM_HIDDEN_4B

    TransformerWeights_RMS_FINAL_WEIGHT_f4B=PTR_f4B
    PTR_f4B+=CONFIG_DIM_4B

    'EACH FREQ_CIS TABLE HOLDS SEQ_LEN ROWS OF HEAD_SIZE\2 VALUES
    'GUARD THE DIVISOR: A ZERO HEAD COUNT WOULD OTHERWISE TRAP ON INTEGER DIVISION
    IF CONFIG_N_HEADS_4B>0 THEN HEAD_SIZE_4B=CONFIG_DIM_4B\CONFIG_N_HEADS_4B
    FREQ_COUNT_4B=CONFIG_SEQ_LEN_4B*HEAD_SIZE_4B\2
    TransformerWeights_FREQ_CIS_REAL_f4B=PTR_f4B
    PTR_f4B+=FREQ_COUNT_4B
    TransformerWeights_FREQ_CIS_IMAG_f4B=PTR_f4B
    PTR_f4B+=FREQ_COUNT_4B

    'CLASSIFIER: EITHER AN ALIAS OF THE EMBEDDING TABLE OR THE REMAINING DATA
    IF SHARED_WEIGHTS_4B=1 THEN
        TransformerWeights_WCLS_f4B=TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B
    ELSE
        TransformerWeights_WCLS_f4B=PTR_f4B
    ENDIF
END SUB
'---------------------------
'NEURAL NET BLOCKS
'ELEMENTWISE IN-PLACE ADD: A[K]=A[K]+B[K] FOR K=0..SIZE_4B-1.
'B IS ONLY READ. NOTHING HAPPENS WHEN SIZE_4B IS ZERO OR NEGATIVE.
SUB SUB_ACCUM( A_f4B AS SINGLE PTR ,B_f4B AS SINGLE PTR,SIZE_4B AS INTEGER)
    DIM AS INTEGER IDX=0
    WHILE IDX<SIZE_4B
        A_f4B[IDX]=A_f4B[IDX]+B_f4B[IDX]
        IDX+=1
    WEND
END SUB
'RMS NORMALISATION WITH A PER-ELEMENT GAIN:
'  OUT[K]=WEIGHT[K]*(X[K]/SQR(MEAN(X^2)+0.00001)) FOR K=0..SIZE_IN_4B-1
'OUT_O_f4B MAY BE THE SAME BUFFER AS X_f4B: THE MEAN OF SQUARES IS FULLY COMPUTED
'BEFORE ANY OUTPUT ELEMENT IS WRITTEN, AND EACH OUTPUT DEPENDS ONLY ON ITS OWN INPUT.
SUB SUB_RMSNORM(OUT_O_f4B AS SINGLE PTR,X_f4B AS SINGLE PTR,WEIGHT_f4B AS SINGLE PTR,SIZE_IN_4B AS INTEGER)
    DIM AS INTEGER IDX
    DIM AS SINGLE SCALE_f4B=0

    'SUM OF SQUARES, ACCUMULATED IN SINGLE PRECISION
    IDX=0
    DO WHILE IDX<SIZE_IN_4B
        SCALE_f4B+=X_f4B[IDX]*X_f4B[IDX]
        IDX+=1
    LOOP

    'MEAN, EPSILON FOR STABILITY, THEN RECIPROCAL SQUARE ROOT
    SCALE_f4B/=SIZE_IN_4B
    SCALE_f4B+=0.00001
    SCALE_f4B=1.0/SQR(SCALE_f4B)

    'NORMALIZE AND SCALE
    IDX=0
    DO WHILE IDX<SIZE_IN_4B
        OUT_O_f4B[IDX]=WEIGHT_f4B[IDX]*(SCALE_f4B*X_f4B[IDX])
        IDX+=1
    LOOP
END SUB
'IN-PLACE SOFTMAX OVER X[0..IN_SIZE_4B-1].
'THE MAXIMUM IS SUBTRACTED BEFORE EXPONENTIATION FOR NUMERICAL STABILITY.
'AN EMPTY OR NEGATIVE SIZE IS A NO-OP. THE LOOP INDEX IS A SIGNED INTEGER: WITH THE
'FORMER UINTEGER INDEX THE BOUND IN_SIZE_4B-1 WRAPPED TO A HUGE VALUE FOR SIZE 0 AND
'THE LOOPS RAN FAR PAST THE BUFFER.
SUB SUB_SOFTMAX(X_f4B AS SINGLE PTR,IN_SIZE_4B AS INTEGER)
    IF IN_SIZE_4B<=0 THEN EXIT SUB

    DIM AS INTEGER I
    'FIND MAX VALUE (FOR NUMERICAL STABILITY)
    DIM AS SINGLE MAX_VAL_f4B=X_f4B[0]
    FOR I=1 TO IN_SIZE_4B-1
        IF X_f4B[I]>MAX_VAL_f4B THEN MAX_VAL_f4B=X_f4B[I]
    NEXT I

    'EXP AND SUM
    DIM AS SINGLE SUM_f4B=0.0
    FOR I=0 TO IN_SIZE_4B-1
        X_f4B[I]=EXP(X_f4B[I]-MAX_VAL_f4B)
        SUM_f4B+=X_f4B[I]
    NEXT I

    'NORMALIZE (SUM IS AT LEAST 1 BECAUSE THE MAXIMUM ELEMENT MAPS TO EXP(0))
    FOR I=0 TO IN_SIZE_4B-1
        X_f4B[I]/=SUM_f4B
    NEXT I
END SUB
'MATRIX-VECTOR PRODUCT: W (D,N) @ X (N,) -> XOUT (D,)
'W IS ROW-MAJOR, SO ROW R STARTS AT W+R*N. EACH DOT PRODUCT IS ACCUMULATED IN
'SINGLE PRECISION AND WRITTEN TO XOUT[R]. XOUT MUST NOT OVERLAP X.
SUB SUB_MATMUL(XOUT_f4B AS SINGLE PTR,X_f4B AS SINGLE PTR,W_f4B AS SINGLE PTR,IN_N_4B AS INTEGER,IN_D_4B AS INTEGER)
    DIM AS INTEGER ROW,COL
    DIM AS SINGLE PTR W_ROW_f4B
    DIM AS SINGLE ACC_f4B

    FOR ROW=0 TO IN_D_4B-1
        W_ROW_f4B=W_f4B+ROW*IN_N_4B
        ACC_f4B=0
        FOR COL=0 TO IN_N_4B-1
            ACC_f4B+=W_ROW_f4B[COL]*X_f4B[COL]
        NEXT COL
        XOUT_f4B[ROW]=ACC_f4B
    NEXT ROW
END SUB
'RUN ONE FORWARD PASS OF THE TRANSFORMER FOR A SINGLE TOKEN.
'  TOKEN_4B : INDEX OF THE INPUT TOKEN (ROW OF THE TOKEN EMBEDDING TABLE)
'  POS_4B   : ZERO-BASED POSITION OF THAT TOKEN; ROWS 0..POS_4B-1 OF THE KV CACHE MUST
'             ALREADY HAVE BEEN FILLED BY EARLIER CALLS
'RESULT: RunState_LOGITS_f4B HOLDS ONE SCORE PER VOCABULARY ENTRY.
'SIDE EFFECTS: OVERWRITES ALL RunState_* SCRATCH BUFFERS AND WRITES ROW POS_4B OF THE
'KEY AND VALUE CACHES FOR EVERY LAYER.
'THE ACTIVATION VECTOR X LIVES IN THE PREALLOCATED RunState_X_f4B BUFFER. THE PREVIOUS
'CODE ALLOCATED A NEW X BUFFER ON EVERY CALL AND NEVER FREED IT (ONE LEAK PER TOKEN).
'HEAD SIZE AND ROW OFFSETS USE INTEGER DIVISION (\) RATHER THAN FLOATING-POINT (/).
SUB SUB_TRANSFORMER(TOKEN_4B AS INTEGER,POS_4B AS INTEGER)
    ' A FEW CONVENIENCE VARIABLES
    DIM AS SINGLE PTR X_f4B=RunState_X_f4B
    DIM AS INTEGER DIM_4B=CONFIG_DIM_4B
    DIM AS INTEGER HIDDEN_DIM_4B=CONFIG_HIDDEN_DIM_4B
    DIM AS INTEGER HEAD_SIZE_4B=DIM_4B\CONFIG_N_HEADS_4B
    DIM AS DOUBLE SCORE_SCALE=SQR(HEAD_SIZE_4B) 'ATTENTION SCORES ARE DIVIDED BY SQR(HEAD_SIZE)
    DIM AS INTEGER L,H,I,T 'SIGNED INDICES: AN EMPTY RANGE MUST NOT WRAP AROUND
    DIM AS INTEGER LOFF_4B
    DIM AS SINGLE PTR Q_f4B,K_f4B,V_f4B,ATT_f4B,XB_f4B
    DIM AS SINGLE Q0_f4B,Q1_f4B,K0_f4B,K1_f4B,FCR_f4B,FCI_f4B,SCORE_f4B,A_f4B

    'COPY THE TOKEN EMBEDDING INTO X (THE EXISTING RUN-STATE BUFFER, NO ALLOCATION)
    MEMCPY(X_f4B,TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B+TOKEN_4B*DIM_4B,DIM_4B*SIZEOF(SINGLE))

    'PLUCK OUT THE "POS" ROW OF FREQ_CIS_REAL AND FREQ_CIS_IMAG
    DIM AS INTEGER FREQ_OFF_4B=POS_4B*HEAD_SIZE_4B\2
    DIM AS SINGLE PTR FREQ_CIS_REAL_ROW_f4B=TransformerWeights_FREQ_CIS_REAL_f4B+FREQ_OFF_4B
    DIM AS SINGLE PTR FREQ_CIS_IMAG_ROW_f4B=TransformerWeights_FREQ_CIS_IMAG_f4B+FREQ_OFF_4B

    'FORWARD ALL THE LAYERS
    FOR L=0 TO CONFIG_N_LAYERS_4B-1
        'ATTENTION RMSNORM
        SUB_RMSNORM(RunState_XB_f4B,X_f4B,TransformerWeights_RMS_ATT_WEIGHT_f4B+L*DIM_4B,DIM_4B)

        'QKV MATMUL FOR THIS POSITION
        SUB_MATMUL(RunState_Q_f4B,RunState_XB_f4B,TransformerWeights_WQ_f4B+L*DIM_4B*DIM_4B,DIM_4B,DIM_4B)
        SUB_MATMUL(RunState_K_f4B,RunState_XB_f4B,TransformerWeights_WK_f4B+L*DIM_4B*DIM_4B,DIM_4B,DIM_4B)
        SUB_MATMUL(RunState_V_f4B,RunState_XB_f4B,TransformerWeights_WV_f4B+L*DIM_4B*DIM_4B,DIM_4B,DIM_4B)

        'APPLY ROPE ROTATION TO THE Q AND K VECTORS FOR EACH HEAD
        FOR H=0 TO CONFIG_N_HEADS_4B-1
            Q_f4B=RunState_Q_f4B+H*HEAD_SIZE_4B
            K_f4B=RunState_K_f4B+H*HEAD_SIZE_4B
            'ROTATE EACH (EVEN,ODD) PAIR BY THE COMPLEX VALUE (FCR,FCI)
            FOR I=0 TO HEAD_SIZE_4B-1 STEP 2
                Q0_f4B=Q_f4B[I]
                Q1_f4B=Q_f4B[I+1]
                K0_f4B=K_f4B[I]
                K1_f4B=K_f4B[I+1]
                FCR_f4B=FREQ_CIS_REAL_ROW_f4B[I\2]
                FCI_f4B=FREQ_CIS_IMAG_ROW_f4B[I\2]
                Q_f4B[I]=Q0_f4B*FCR_f4B-Q1_f4B*FCI_f4B
                Q_f4B[I+1]=Q0_f4B*FCI_f4B+Q1_f4B*FCR_f4B
                K_f4B[I]=K0_f4B*FCR_f4B-K1_f4B*FCI_f4B
                K_f4B[I+1]=K0_f4B*FCI_f4B+K1_f4B*FCR_f4B
            NEXT I
        NEXT H

        'SAVE KEY,VALUE AT THIS TIME STEP (POS) TO OUR KV CACHE
        LOFF_4B=L*CONFIG_SEQ_LEN_4B*DIM_4B 'START OF THIS LAYER INSIDE THE CACHES
        MEMCPY(RunState_KEY_CACHE_f4B+LOFF_4B+POS_4B*DIM_4B,RunState_K_f4B,DIM_4B*SIZEOF(SINGLE))
        MEMCPY(RunState_VALUE_CACHE_f4B+LOFF_4B+POS_4B*DIM_4B,RunState_V_f4B,DIM_4B*SIZEOF(SINGLE))

        'MULTIHEAD ATTENTION. ITERATE OVER ALL HEADS
        FOR H=0 TO CONFIG_N_HEADS_4B-1
            'QUERY VECTOR AND ATTENTION SCORE ROW FOR THIS HEAD
            Q_f4B=RunState_Q_f4B+H*HEAD_SIZE_4B
            ATT_f4B=RunState_ATT_f4B+H*CONFIG_SEQ_LEN_4B

            'ITERATE OVER ALL TIMESTEPS, INCLUDING THE CURRENT ONE
            FOR T=0 TO POS_4B
                'KEY VECTOR FOR THIS HEAD AT THIS TIMESTEP
                K_f4B=RunState_KEY_CACHE_f4B+LOFF_4B+T*DIM_4B+H*HEAD_SIZE_4B
                'ATTENTION SCORE = DOT PRODUCT OF Q AND K, SCALED BY SQR(HEAD_SIZE)
                SCORE_f4B=0
                FOR I=0 TO HEAD_SIZE_4B-1
                    SCORE_f4B+=Q_f4B[I]*K_f4B[I]
                NEXT I
                SCORE_f4B/=SCORE_SCALE
                ATT_f4B[T]=SCORE_f4B
            NEXT T

            'SOFTMAX THE SCORES TO GET ATTENTION WEIGHTS, FROM 0..POS INCLUSIVELY
            SUB_SOFTMAX(ATT_f4B,POS_4B+1)

            'WEIGHTED SUM OF THE VALUES, STORED BACK INTO THIS HEAD'S SLICE OF XB
            XB_f4B=RunState_XB_f4B+H*HEAD_SIZE_4B
            MEMSET(XB_f4B,0,HEAD_SIZE_4B*SIZEOF(SINGLE))
            FOR T=0 TO POS_4B
                V_f4B=RunState_VALUE_CACHE_f4B+LOFF_4B+T*DIM_4B+H*HEAD_SIZE_4B
                A_f4B=ATT_f4B[T]
                FOR I=0 TO HEAD_SIZE_4B-1
                    XB_f4B[I]+=A_f4B*V_f4B[I]
                NEXT I
            NEXT T
        NEXT H

        'FINAL MATMUL TO GET THE OUTPUT OF THE ATTENTION
        SUB_MATMUL(RunState_XB2_f4B,RunState_XB_f4B,TransformerWeights_WO_f4B+(L*DIM_4B*DIM_4B),DIM_4B,DIM_4B)
        'RESIDUAL CONNECTION BACK INTO X
        SUB_ACCUM(X_f4B,RunState_XB2_f4B,DIM_4B)

        'FFN RMSNORM
        SUB_RMSNORM(RunState_XB_f4B,X_f4B,TransformerWeights_RMS_FFN_WEIGHT_f4B+L*DIM_4B,DIM_4B)

        'NOW FOR FFN IN PYTORCH WE HAVE: SELF.W2(F.SILU(SELF.W1(X))*SELF.W3(X))
        'FIRST CALCULATE SELF.W1(X) AND SELF.W3(X)
        SUB_MATMUL(RunState_HB_f4B,RunState_XB_f4B,TransformerWeights_W1_f4B+L*DIM_4B*HIDDEN_DIM_4B,DIM_4B,HIDDEN_DIM_4B)
        SUB_MATMUL(RunState_HB2_f4B,RunState_XB_f4B,TransformerWeights_W3_f4B+L*DIM_4B*HIDDEN_DIM_4B,DIM_4B,HIDDEN_DIM_4B)

        FOR I=0 TO HIDDEN_DIM_4B-1
            'SILU(X)=X*SIGMOID(X); STORED FIRST SO THE ROUNDING MATCHES TWO SEPARATE PASSES
            RunState_HB_f4B[I]=RunState_HB_f4B[I]*(1.0/(1.0+EXP(-RunState_HB_f4B[I])))
            'ELEMENTWISE MULTIPLY WITH W3(X)
            RunState_HB_f4B[I]=RunState_HB_f4B[I]*RunState_HB2_f4B[I]
        NEXT I

        'FINAL MATMUL TO GET THE OUTPUT OF THE FFN
        SUB_MATMUL(RunState_XB_f4B,RunState_HB_f4B,TransformerWeights_W2_f4B+L*DIM_4B*HIDDEN_DIM_4B,HIDDEN_DIM_4B,DIM_4B)
        'RESIDUAL CONNECTION
        SUB_ACCUM(X_f4B,RunState_XB_f4B,DIM_4B)
    NEXT L

    'FINAL RMSNORM (IN PLACE)
    SUB_RMSNORM(X_f4B,X_f4B,TransformerWeights_RMS_FINAL_WEIGHT_f4B,DIM_4B)
    'CLASSIFIER INTO LOGITS
    SUB_MATMUL(RunState_LOGITS_f4B,X_f4B,TransformerWeights_WCLS_f4B,CONFIG_DIM_4B,CONFIG_VOCAB_SIZE_4B)
END SUB 'END SUB_TRANSFORMER
'DRAW ONE INDEX FROM A DISCRETE PROBABILITY DISTRIBUTION.
'  PROBABILITIES_f4B : N_IN_4B NON-NEGATIVE VALUES THAT MUST SUM TO 1
'  N_IN_4B           : NUMBER OF ENTRIES
'RETURNS THE FIRST INDEX WHOSE CUMULATIVE PROBABILITY EXCEEDS A RND DRAW IN [0,1),
'OR N_IN_4B-1 IF ROUNDING KEEPS THE CUMULATIVE SUM BELOW THE DRAW.
'THE LOOP INDEX IS SIGNED: THE FORMER UINTEGER INDEX TURNED THE BOUND N_IN_4B-1 INTO A
'HUGE VALUE FOR N_IN_4B=0 AND READ FAR PAST THE BUFFER.
FUNCTION FUNC_SAMPLE(PROBABILITIES_f4B AS SINGLE PTR,N_IN_4B AS INTEGER)AS INTEGER
    DIM AS DOUBLE R_f4B=RND
    DIM AS SINGLE CDF_f4B=0
    DIM AS INTEGER I
    FOR I=0 TO N_IN_4B-1
        CDF_f4B+=PROBABILITIES_f4B[I]
        IF R_f4B<CDF_f4B THEN RETURN I
    NEXT I
    RETURN N_IN_4B-1 ' IN CASE OF ROUNDING ERRORS
END FUNCTION
'INDEX OF THE LARGEST VALUE IN V[0..IN_N_4B-1].
'TIES KEEP THE LOWEST INDEX BECAUSE ONLY A STRICTLY GREATER VALUE REPLACES THE BEST.
'V[0] IS ALWAYS READ, SO THE BUFFER MUST HOLD AT LEAST ONE ELEMENT.
FUNCTION FUNC_ARGMAX(V AS SINGLE PTR,IN_N_4B AS INTEGER)AS INTEGER
    DIM AS INTEGER BEST_IDX=0
    DIM AS INTEGER IDX=1
    DIM AS SINGLE BEST_VAL=V[0]
    WHILE IDX<IN_N_4B
        IF V[IDX]>BEST_VAL THEN
            BEST_VAL=V[IDX]
            BEST_IDX=IDX
        ENDIF
        IDX+=1
    WEND
    FUNCTION=BEST_IDX
END FUNCTION
'---------------------------
'RETURN THE CURRENT VALUE OF TIMER CONVERTED TO AN UNSIGNED INTEGER.
'NOTE: DESPITE THE NAME, NO SCALING IS APPLIED. TIMER COUNTS SECONDS, SO THE RESULT IS
'A WHOLE NUMBER OF SECONDS (THE FRACTION IS LOST IN THE CONVERSION), NOT MILLISECONDS.
FUNCTION FUNC_TIME_IN_MS()AS UINTEGER
    DIM AS DOUBLE NOW_SECONDS
    NOW_SECONDS=TIMER()
    FUNCTION=NOW_SECONDS
END FUNCTION
'MAIN
'COMMAND LINE: < CHECKPOINT_FILE > [ TEMPERATURE ] [ STEPS ]
'LOADS THE CHECKPOINT AND THE TOKENIZER, THEN GENERATES UP TO STEPS TOKENS STARTING FROM
'TOKEN 1 AND PRINTS THEM TOGETHER WITH THE ACHIEVED TOKENS PER SECOND.
'EVERY UNRECOVERABLE ERROR (MISSING FILE, SHORT READ, BAD HEADER, OUT OF MEMORY) PRINTS A
'MESSAGE AND ENDS THE PROGRAM WITH EXIT CODE 1 INSTEAD OF CONTINUING WITH INVALID DATA.
DIM AS STRING CHECKPOINT_STR 'E.G. OUT/MODEL.BIN
DIM AS SINGLE TEMPERATURE_f4B=0.9 '0.0=(DETERMINISTIC) ARGMAX SAMPLING, 1.0=BASELINE
DIM AS INTEGER STEPS_4B=256 'MAXIMUM NUMBER OF TOKENS TO GENERATE

' "CHECKPOINT" IS THE ONLY NECESSARY ARG
IF COMMAND(1)="" THEN
    ?"USAGE:";COMMAND(0);" < CHECKPOINT_FILE > [ TEMPERATURE ] [ STEPS ]"
    ?"PRESS ANY KEY TO EXIT.":END
ENDIF
CHECKPOINT_STR=COMMAND(1)
IF COMMAND(2)>"" THEN TEMPERATURE_f4B=ATOF(COMMAND(2))
IF COMMAND(3)>"" THEN STEPS_4B=ATOI(COMMAND(3))

'SEED RNG WITH TIME. IF YOU WANT DETERMINISTIC BEHAVIOR USE TEMPERATURE 0.0
RANDOMIZE TIMER

'READ IN THE MODEL.BIN FILE
?":399: CHECKPOINT_STR=";CHECKPOINT_STR:?":399: TEMPERATURE_f4B=";TEMPERATURE_f4B:?":399: STEPS_4B=";STEPS_4B
DIM DATA_f4B AS SINGLE PTR
DIM FILE_SIZE_4B AS LONGINT
DIM AS FILE PTR FILE_PTR
FILE_PTR=FOPEN(CHECKPOINT_STR,"rb")
IF FILE_PTR=0 THEN
    COLOR 4:?"FILE '";CHECKPOINT_STR;"' OPENNING ERROR":COLOR 7
    END 1
ENDIF
COLOR 2:?":407: FILE '";CHECKPOINT_STR;"' OPENED "
COLOR 7

'READ THE CONFIG HEADER: SEVEN 32-BIT FIELDS.
'THEY ARE READ INTO LONG (ALWAYS 32-BIT) AND THEN WIDENED, SO A NEGATIVE VOCAB SIZE KEEPS
'ITS SIGN EVEN WHERE INTEGER IS 64-BIT.
DIM AS LONG HEADER_4B(0 TO 6)
IF FREAD(@HEADER_4B(0),SIZEOF(LONG),7,FILE_PTR)<>7 THEN
    COLOR 4:?"FILE '";CHECKPOINT_STR;"' IS TOO SHORT TO HOLD THE CONFIG HEADER":COLOR 7
    FCLOSE(FILE_PTR)
    END 1
ENDIF
CONFIG_DIM_4B=HEADER_4B(0)
CONFIG_HIDDEN_DIM_4B=HEADER_4B(1)
CONFIG_N_LAYERS_4B=HEADER_4B(2)
CONFIG_N_HEADS_4B=HEADER_4B(3)
CONFIG_N_KV_HEADS_4B=HEADER_4B(4)
CONFIG_VOCAB_SIZE_4B=HEADER_4B(5)
CONFIG_SEQ_LEN_4B=HEADER_4B(6)
?":412: DIM_4B=";CONFIG_DIM_4B;" HIDDEN_DIM_4B=";CONFIG_HIDDEN_DIM_4B;" N_LAYERS_4B_4B=";CONFIG_N_LAYERS_4B;" N_HEADS_4B=";CONFIG_N_HEADS_4B," N_KV_HEADS_4B=";CONFIG_N_KV_HEADS_4B;" VOCAB_SIZE_4B=";CONFIG_VOCAB_SIZE_4B;" SEQ_LEN_4B=";CONFIG_SEQ_LEN_4B

'NEGATIVE VOCAB SIZE IS A HACKY WAY OF SIGNALING UNSHARED WEIGHTS. BIT YIKES
DIM AS INTEGER SHARED_WEIGHTS_4B
IF CONFIG_VOCAB_SIZE_4B>0 THEN SHARED_WEIGHTS_4B=1 ELSE SHARED_WEIGHTS_4B=0
CONFIG_VOCAB_SIZE_4B=ABS(CONFIG_VOCAB_SIZE_4B)

'EVERY SIZE IS USED AS AN ALLOCATION COUNT OR A DIVISOR LATER ON
IF CONFIG_DIM_4B<=0 OR CONFIG_HIDDEN_DIM_4B<=0 OR CONFIG_N_LAYERS_4B<=0 _
    OR CONFIG_N_HEADS_4B<=0 OR CONFIG_VOCAB_SIZE_4B<=0 OR CONFIG_SEQ_LEN_4B<=0 THEN
    COLOR 4:?"FILE '";CHECKPOINT_STR;"' HAS AN INVALID CONFIG HEADER":COLOR 7
    FCLOSE(FILE_PTR)
    END 1
ENDIF

'FIGURE OUT THE FILE SIZE
FSEEK(FILE_PTR,0,SEEK_END)'MOVE FILE POINTER TO THE END OF FILE
FILE_SIZE_4B=FTELL(FILE_PTR):?":417: FILE_SIZE_4B=";FILE_SIZE_4B'GET FILE SIZE IN BYTES
IF FILE_SIZE_4B<=7*SIZEOF(LONG) THEN
    COLOR 4:?"FILE '";CHECKPOINT_STR;"' CONTAINS NO WEIGHTS":COLOR 7
    FCLOSE(FILE_PTR)
    END 1
ENDIF

'LOAD THE WHOLE FILE (HEADER INCLUDED) INTO ONE BUFFER OF EXACTLY FILE_SIZE BYTES
FSEEK(FILE_PTR,0,SEEK_SET)
DATA_f4B=ALLOCATE(FILE_SIZE_4B)
IF DATA_f4B=0 THEN
    COLOR 4:?"MEMORY ALLOCATION FAILED!":COLOR 7
    FCLOSE(FILE_PTR)
    END 1
ENDIF
IF FREAD(DATA_f4B,1,FILE_SIZE_4B,FILE_PTR)<>FILE_SIZE_4B THEN
    COLOR 4:?"Error reading the '";CHECKPOINT_STR;"' file":COLOR 7
    FCLOSE(FILE_PTR)
    END 1
ENDIF
FCLOSE(FILE_PTR)

'THE WEIGHTS START RIGHT AFTER THE SEVEN 4-BYTE HEADER FIELDS
DIM AS SINGLE PTR WEIGHTS_PTR=DATA_f4B+7
SUB_CHECKPOINT_INIT_WEIGHTS(WEIGHTS_PTR,SHARED_WEIGHTS_4B)':115:

'RIGHT NOW WE CANNOT RUN FOR MORE THAN CONFIG.SEQ_LEN STEPS
IF STEPS_4B<=0 OR STEPS_4B>CONFIG_SEQ_LEN_4B THEN STEPS_4B=CONFIG_SEQ_LEN_4B

'READ IN THE TOKENIZER.BIN FILE: VOCAB_SIZE RECORDS OF (32-BIT LENGTH, BYTES)
DIM AS ZSTRING PTR VOCAB_STR(0 TO CONFIG_VOCAB_SIZE_4B-1)
FILE_PTR=FOPEN("TOKENIZER.BIN","rb")
IF FILE_PTR=0 THEN
    COLOR 4:?"UNABLE TO OPEN THE TOKENIZER FILE 'TOKENIZER.BIN'! RUN""PYTHON 'TOKENIZER.PY' TO CONVERT 'TOKENIZER.MODEL' -> 'TOKENIZER.BIN'":COLOR 7
    END 1
ENDIF
COLOR 2:?":434: FILE 'TOKENIZER.BIN' OPENED ":COLOR 7
?":438: CONFIG_VOCAB_SIZE_4B=";CONFIG_VOCAB_SIZE_4B
DIM AS LONG LEN_4B
DIM AS INTEGER I
FOR I=0 TO CONFIG_VOCAB_SIZE_4B-1
    IF FREAD(@LEN_4B,SIZEOF(LONG),1,FILE_PTR)<>1 ORELSE LEN_4B<0 THEN
        COLOR 4:?"TOKENIZER FILE 'TOKENIZER.BIN' IS TRUNCATED OR CORRUPT":COLOR 7
        FCLOSE(FILE_PTR)
        END 1
    ENDIF
    'ONE EXTRA ZEROED BYTE KEEPS THE TOKEN NUL-TERMINATED
    VOCAB_STR(I)=CALLOCATE(LEN_4B+1,1)
    IF VOCAB_STR(I)=0 THEN
        COLOR 4:?"MEMORY ALLOCATION FAILED!":COLOR 7
        FCLOSE(FILE_PTR)
        END 1
    ENDIF
    IF LEN_4B>0 ANDALSO FREAD(VOCAB_STR(I),1,LEN_4B,FILE_PTR)<>LEN_4B THEN
        COLOR 4:?"TOKENIZER FILE 'TOKENIZER.BIN' IS TRUNCATED OR CORRUPT":COLOR 7
        FCLOSE(FILE_PTR)
        END 1
    ENDIF
NEXT I:?":445:"
FCLOSE(FILE_PTR)

' CREATE AND INIT THE APPLICATION RUN STATE
SUB_MALLOC_RUN_STATE()

'GENERATION LOOP. TIMER IS KEPT AS A DOUBLE SO SUB-SECOND RUNS ARE MEASURED CORRECTLY
DIM AS DOUBLE START_TIME_f8B=TIMER
DIM AS INTEGER NEXT_4B
DIM AS INTEGER TOKEN_4B=1 'BOS TOKEN
DIM AS INTEGER POS_4B=0 'THE CURRENT POSITION WE ARE IN
DIM AS INTEGER Q
?"<S>" 'EXPLICIT PRINT OF THE INITIAL BOS TOKEN (=1), STYLISTICALLY SYMMETRIC
WHILE POS_4B<STEPS_4B
    'FORWARD THE TRANSFORMER TO GET LOGITS FOR THE NEXT TOKEN
    SUB_TRANSFORMER(TOKEN_4B,POS_4B)' :203:

    'SAMPLE THE NEXT TOKEN
    IF TEMPERATURE_f4B=0.0 THEN
        'GREEDY ARGMAX SAMPLING
        NEXT_4B=FUNC_ARGMAX(RunState_LOGITS_f4B,CONFIG_VOCAB_SIZE_4B)
    ELSE
        'APPLY THE TEMPERATURE TO THE LOGITS
        FOR Q=0 TO CONFIG_VOCAB_SIZE_4B-1
            RunState_LOGITS_f4B[Q]/=TEMPERATURE_f4B
        NEXT Q
        'APPLY SOFTMAX TO THE LOGITS TO GET THE PROBABILITIES FOR NEXT TOKEN
        SUB_SOFTMAX(RunState_LOGITS_f4B,CONFIG_VOCAB_SIZE_4B)':171:
        'WE NOW WANT TO SAMPLE FROM THIS DISTRIBUTION TO GET THE NEXT TOKEN
        NEXT_4B=FUNC_SAMPLE(RunState_LOGITS_f4B,CONFIG_VOCAB_SIZE_4B)':334:
    ENDIF
    COLOR &HC:? *VOCAB_STR(NEXT_4B);:COLOR 7 '&HC=LIGHT RED 7=WHITE

    'ADVANCE FORWARD
    TOKEN_4B=NEXT_4B
    POS_4B+=1
WEND

'REPORT ACHIEVED TOK/S
DIM AS DOUBLE ELAPSED_f8B=TIMER-START_TIME_f8B
?
IF ELAPSED_f8B>0 THEN
    ?"ACHIEVED TOK/S:";(STEPS_4B/ELAPSED_f8B)
ELSE
    ?"ACHIEVED TOK/S:";" N/A (RUN TOO SHORT TO MEASURE)"
ENDIF

'MEMORY CLEANUP (BOTH FILES WERE CLOSED RIGHT AFTER READING)
SUB_FREE_RUN_STATE()
FOR I=0 TO CONFIG_VOCAB_SIZE_4B-1
    DEALLOCATE(VOCAB_STR(I))
NEXT I
DEALLOCATE(DATA_f4B)
END
'END MAIN