gguf : deprecate old FIM token KVs

ggerganov committed Oct 11, 2024
1 parent 3ae8670 · commit 3681540

Showing 3 changed files with 36 additions and 14 deletions.
gguf-py/gguf/constants.py: 26 changes (21 additions, 5 deletions)

@@ -152,6 +152,8 @@ class Tokenizer:
MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id"
EOS_ID = "tokenizer.ggml.eos_token_id"
EOT_ID = "tokenizer.ggml.eot_token_id"
EOM_ID = "tokenizer.ggml.eom_token_id"
UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id"
@@ -168,11 +170,16 @@ class Tokenizer:
         CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES = "tokenizer.chat_templates"
         # FIM/Infill special tokens constants
+        FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
+        FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
+        FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id"
+        FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
+        FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
+        FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
+        # deprecated:
         PREFIX_ID = "tokenizer.ggml.prefix_token_id"
         SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"
-        EOT_ID = "tokenizer.ggml.eot_token_id"
-        EOM_ID = "tokenizer.ggml.eom_token_id"
 
     class Adapter:
         TYPE = "adapter.type"
@@ -1579,15 +1586,24 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
 KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
 KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
+KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
+KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
 KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
 KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
 KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
 KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
 KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
-KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
+
+KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
+KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
+KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
+KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
+KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
+KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
+
+# deprecated
+KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID
 KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
 KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
-KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
-KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
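With these constants in place, a conversion script can emit the new FIM token KVs by key. A minimal sketch (the output path, architecture, and token ids below are placeholders; `add_uint32` is the same generic KV writer the removed helpers used internally):

```python
from gguf import GGUFWriter, Keys

# Placeholder path and arch; real token ids come from the model's tokenizer.
writer = GGUFWriter("model.gguf", arch="llama")
writer.add_uint32(Keys.Tokenizer.FIM_PRE_ID, 1)  # id of the FIM prefix token
writer.add_uint32(Keys.Tokenizer.FIM_SUF_ID, 2)  # id of the FIM suffix token
writer.add_uint32(Keys.Tokenizer.FIM_MID_ID, 3)  # id of the FIM middle token
```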
gguf-py/gguf/gguf_writer.py: 9 changes (0 additions, 9 deletions)

@@ -843,15 +843,6 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:

         self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
 
-    def add_prefix_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
-
-    def add_suffix_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
-
-    def add_middle_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
-
     def add_eot_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOT_ID, id)
 
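Since `add_prefix_token_id`, `add_suffix_token_id`, and `add_middle_token_id` are gone, callers that used them can write the corresponding KV directly. A rough migration sketch, assuming `writer` and the old ids already exist in the surrounding script; the prefix/suffix/middle ids map onto the new FIM pre/suf/mid keys, the same mapping the loader applies in src/llama.cpp below:

```python
from gguf import Keys

# `writer`, `prefix_id`, `suffix_id`, `middle_id` are assumed to be defined.
writer.add_uint32(Keys.Tokenizer.FIM_PRE_ID, prefix_id)  # was add_prefix_token_id
writer.add_uint32(Keys.Tokenizer.FIM_SUF_ID, suffix_id)  # was add_suffix_token_id
writer.add_uint32(Keys.Tokenizer.FIM_MID_ID, middle_id)  # was add_middle_token_id
```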
src/llama.cpp: 15 changes (15 additions, 0 deletions)

@@ -368,6 +368,11 @@ enum llm_kv {

     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
+
+    // deprecated:
+    LLM_KV_TOKENIZER_PREFIX_ID,
+    LLM_KV_TOKENIZER_SUFFIX_ID,
+    LLM_KV_TOKENIZER_MIDDLE_ID,
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -479,6 +484,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_ADAPTER_TYPE,       "adapter.type"       },
     { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+
+    // deprecated
+    { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+    { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+    { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
 };
 
 struct LLM_KV {
@@ -6533,6 +6543,11 @@ static void llm_load_vocab(
         { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
         { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
         { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
+
+        // deprecated
+        { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
+        { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
+        { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
     };
 
     for (const auto & it : special_token_types) {
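The loader keeps the deprecated keys working: `llm_load_vocab` reads `tokenizer.ggml.prefix_token_id`, `suffix_token_id`, and `middle_token_id` into the same `special_fim_pre/suf/mid` fields that the new `fim_*_token_id` keys target, so older GGUF files still resolve their FIM tokens. A reader-side sketch of one reasonable precedence in Python ("model.gguf" is a placeholder path, and the scalar decoding assumes gguf-py's ReaderField layout):

```python
from __future__ import annotations

from gguf import GGUFReader, Keys

reader = GGUFReader("model.gguf")  # placeholder path

def kv_token_id(key: str) -> int | None:
    field = reader.get_field(key)
    if field is None:
        return None
    # Scalar KVs store their value in the part indexed by data[0].
    return int(field.parts[field.data[0]][0])

# Prefer the new key; fall back to the deprecated one for older files.
fim_pre = kv_token_id(Keys.Tokenizer.FIM_PRE_ID)
if fim_pre is None:
    fim_pre = kv_token_id(Keys.Tokenizer.PREFIX_ID)  # deprecated fallback
```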
