From 3fec68be4e9577fc53158366d3b3af039c17bb1f Mon Sep 17 00:00:00 2001
From: Junyang Lin
Date: Wed, 24 Apr 2024 15:16:21 +0800
Subject: [PATCH] convert : add support of codeqwen due to tokenizer (#6707)

* add support of codeqwen due to tokenizer

* override load_hparams

* fix typo

* fix load_params

* convert : fix whitespace

---------

Co-authored-by: Georgi Gerganov
---
 convert-hf-to-gguf.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 4ace13eb63149..5763b6664e832 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -363,6 +363,16 @@ def _set_vocab_sentencepiece(self):
             scores.append(-1000.0)
             toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            print(
+                f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
+            )
+            for i in range(1, pad_count + 1):
+                tokens.append(f"[PAD{i}]")
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
         assert len(tokens) == vocab_size
 
         self.gguf_writer.add_tokenizer_model("llama")
@@ -1789,6 +1799,12 @@ def write_tensors(self):
 class Qwen2Model(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2
 
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
 
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):