@@ -47,6 +47,9 @@ def _get_model_architecture(self):
             return gguf.MODEL_ARCH.MPT
         if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
             return gguf.MODEL_ARCH.BAICHUAN
+        if arch == "FalconForCausalLM":
+            return gguf.MODEL_ARCH.FALCON
+
         raise NotImplementedError(f'Architecture "{arch}" not supported!')
 
     def set_vocab(self):
@@ -180,6 +183,8 @@ def from_model_architecture(model_architecture):
             return MPTModel
         if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
             return BaichuanModel
+        if model_architecture == "FalconForCausalLM":
+            return FalconModel
         return Model
 
 class StableLMModel(Model):
@@ -537,3 +542,96 @@ def write_tensors(self):
             print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
             self.gguf_writer.add_tensor(new_name, data)
 
+
+class FalconModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams.get("num_hidden_layers")
+        if block_count is None:
+            block_count = self.hparams["n_layer"]  # old name
+
+        n_head = self.hparams.get("num_attention_heads")
+        if n_head is None:
+            n_head = self.hparams["n_head"]  # old name
+
+        n_head_kv = self.hparams.get("num_kv_heads")
+        if n_head_kv is None:
+            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
+
+        self.gguf_writer.add_name("Falcon")
+        self.gguf_writer.add_context_length(2048)  # not in config.json
+        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("num_hidden_layers")
+        if block_count is None:
+            block_count = self.hparams["n_layer"]  # old name
+
+        n_head = self.hparams.get("num_attention_heads")
+        if n_head is None:
+            n_head = self.hparams["n_head"]  # old name
+
+        n_head_kv = self.hparams.get("num_kv_heads")
+        if n_head_kv is None:
+            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
+
+        head_dim = self.hparams["hidden_size"] // n_head
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data in self.get_tensors():
+            old_dtype = data.dtype
+
+            # convert any unsupported data types to float32
+            if data.dtype != torch.float16 and data.dtype != torch.float32:
+                data = data.to(torch.float32)
+
+            # QKV tensor transform
+            # The original query_key_value tensor contains n_head_kv "kv groups",
+            # each consisting of n_head/n_head_kv query weights followed by one key
+            # and one value weight (shared by all query heads in the kv group).
+            # This layout makes it a big pain to work with in GGML.
+            # So we rearrange them here, so that we have n_head query weights
+            # followed by n_head_kv key weights followed by n_head_kv value weights,
+            # in contiguous fashion.
+            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+            if "query_key_value" in name:
+                qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
+                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                data = torch.cat((q, k, v)).reshape_as(data)
+
+            data = data.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print("Can not map tensor '" + name + "'")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+            self.gguf_writer.add_tensor(new_name, data)
+
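
The QKV transform in write_tensors above is easiest to sanity-check in isolation. Below is a minimal standalone sketch, not part of the diff, that applies the same view/reshape/concat sequence to a toy fused query_key_value weight; the sizes n_head=4, n_head_kv=2, hidden_size=8 are made up for illustration.

import torch

# toy Falcon-style dimensions (assumed for illustration only)
n_head, n_head_kv, hidden = 4, 2, 8
head_dim = hidden // n_head

# fake fused query_key_value weight: n_head_kv groups of
# (n_head/n_head_kv query heads + 1 key head + 1 value head), each head_dim rows tall
rows = (n_head + 2 * n_head_kv) * head_dim
data = torch.arange(rows * hidden, dtype=torch.float32).reshape(rows, hidden)

# same rearrangement as in FalconModel.write_tensors:
# interleaved kv groups -> contiguous Q block, then K block, then V block
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
out = torch.cat((q, k, v)).reshape_as(data)

print(out.shape)  # torch.Size([16, 8]) -- shape unchanged, rows regrouped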