@@ -82,7 +82,7 @@ struct llama_hparams {
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
-    uint32_t f16     = 1;
+    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
         return memcmp(this, &other, sizeof(llama_hparams));
@@ -432,7 +432,7 @@ struct llama_file_loader {
         hparams.n_head  = file.read_u32();
         hparams.n_layer = file.read_u32();
         hparams.n_rot   = file.read_u32();
-        hparams.f16     = file.read_u32();
+        hparams.ftype   = (enum llama_ftype) file.read_u32();
     }
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
@@ -458,20 +458,21 @@ struct llama_file_loader {
             llama_load_tensor_shard shard;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-            uint32_t ftype = file.read_u32();
+            shard.type = (enum ggml_type) file.read_u32();
             shard.ne.resize(n_dims);
             file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
             }
-            switch (ftype) {
-                case 0: shard.type = GGML_TYPE_F32; break;
-                case 1: shard.type = GGML_TYPE_F16; break;
-                case 2: shard.type = GGML_TYPE_Q4_0; break;
-                case 3: shard.type = GGML_TYPE_Q4_1; break;
+            switch (shard.type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
+                    break;
                 default: {
-                    throw format("unrecognized ftype %u\n", ftype);
+                    throw format("unrecognized tensor type %u\n", shard.type);
                 }
             }
 
@@ -502,26 +503,26 @@ struct llama_file_loader {
 struct llama_file_saver {
     llama_file file;
     llama_file_loader * any_file_loader;
-    llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16)
+    llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
         : file(fname, "wb"), any_file_loader(any_file_loader) {
         fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
         write_magic();
-        write_hparams(new_f16);
+        write_hparams(new_ftype);
         write_vocab();
     }
     void write_magic() {
         file.write_u32('ggjt'); // magic
         file.write_u32(1); // version
     }
-    void write_hparams(uint32_t new_f16) {
+    void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
         file.write_u32(hparams.n_vocab);
         file.write_u32(hparams.n_embd);
         file.write_u32(hparams.n_mult);
         file.write_u32(hparams.n_head);
         file.write_u32(hparams.n_layer);
         file.write_u32(hparams.n_rot);
-        file.write_u32(new_f16);
+        file.write_u32(new_ftype);
     }
     void write_vocab() {
         if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
@@ -536,17 +537,17 @@ struct llama_file_saver {
         }
     }
     void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
-        uint32_t ftype;
         switch (new_type) {
-            case GGML_TYPE_F32: ftype = 0; break;
-            case GGML_TYPE_F16: ftype = 1; break;
-            case GGML_TYPE_Q4_0: ftype = 2; break;
-            case GGML_TYPE_Q4_1: ftype = 3; break;
+            case GGML_TYPE_F32:
+            case GGML_TYPE_F16:
+            case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_1:
+                break;
             default: LLAMA_ASSERT(false);
         }
         file.write_u32((uint32_t) tensor.ne.size());
         file.write_u32((uint32_t) tensor.name.size());
-        file.write_u32(ftype);
+        file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
         file.seek(-file.tell() & 31, SEEK_CUR);
@@ -820,6 +821,16 @@ static const char *llama_file_version_name(llama_file_version version) {
     }
 }
 
+static const char *llama_ftype_name(enum llama_ftype ftype) {
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32:     return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
+        default: LLAMA_ASSERT(false);
+    }
+}
+
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_7B: return "7B";
@@ -872,7 +883,7 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
         fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
-        fprintf(stderr, "%s: f16        = %u\n",  __func__, hparams.f16);
+        fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
         fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
@@ -1544,17 +1555,17 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
     ggml_type quantized_type;
-    switch (itype) {
-        case 2: quantized_type = GGML_TYPE_Q4_0; break;
-        case 3: quantized_type = GGML_TYPE_Q4_1; break;
-        default: throw format("invalid quantization type %d\n", itype);
+    switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        default: throw format("invalid output file type %d\n", ftype);
     };
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -1745,9 +1756,9 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        int itype) {
+        enum llama_ftype ftype) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, itype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
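
The hunks above use the llama_ftype enum and the LLAMA_FTYPE_* constants without showing their declaration, which sits in the public header rather than in llama.cpp. A minimal sketch of what that declaration presumably looks like follows, with the numeric values inferred from the old integer codes that the removed switch statements mapped to GGML_TYPE_F32/F16/Q4_0/Q4_1 (0, 1, 2, 3); the caller at the end is hypothetical and only illustrates passing one of these constants to the new enum-typed llama_model_quantize signature.

// Sketch only: assumed llama_ftype declaration (declared in llama.h, not shown in this diff).
// Values mirror the integer codes previously stored in the hparams "f16" field.
enum llama_ftype {
    LLAMA_FTYPE_ALL_F32     = 0,
    LLAMA_FTYPE_MOSTLY_F16  = 1,
    LLAMA_FTYPE_MOSTLY_Q4_0 = 2,
    LLAMA_FTYPE_MOSTLY_Q4_1 = 3,
};

// Hypothetical caller of the updated API: quantize an F16 model to Q4_0.
// The file names are placeholders.
int rc = llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_0.bin",
                              LLAMA_FTYPE_MOSTLY_Q4_0);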