From d9a9b093f9a2da5a9b320901912fa527691eed1d Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Sat, 26 Aug 2023 23:03:01 +0200
Subject: [PATCH] llama.cpp : fix LF token

---
 llama.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index a9a2b4d5c5f506..83a6ca8482f1e7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1636,6 +1636,7 @@ static void llm_load_hparams(
 
 // TODO: This should probably be in llama.h
 static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
         llama_model_loader & ml,
@@ -1737,7 +1738,11 @@ static void llm_load_vocab(
     }
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
-    vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+    } else {
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+    }
 
     // special tokens
     GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
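
Note on the fix: tokenizing "\n" through llama_tokenize_internal() and taking
element [0] is only reliable for BPE-style vocabs (e.g. Falcon, where '\n' is
token 193). For a SentencePiece (SPM) vocab the tokenizer may prepend a
whitespace prefix or merge pieces, so the first returned token is not
guaranteed to be the raw linefeed; the byte piece "<0x0A>" can instead be
looked up directly. Below is a minimal, self-contained sketch of such a
byte-piece lookup, assuming an SPM vocab that stores raw bytes as "<0xXX>"
strings in a token_to_id map. The names toy_vocab and toy_byte_to_token, and
the example token id, are illustrative stand-ins, not the exact llama.cpp
internals.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <unordered_map>

    // Illustrative stand-ins for llama.cpp internals.
    using llama_token = int32_t;

    struct toy_vocab {
        // SentencePiece-style vocab: raw bytes are stored as "<0xXX>" pieces.
        std::unordered_map<std::string, llama_token> token_to_id;
    };

    // Format the byte as its "<0xXX>" piece and look it up directly,
    // bypassing the tokenizer (and its prefix/merge behavior) entirely.
    static llama_token toy_byte_to_token(const toy_vocab & vocab, uint8_t ch) {
        char buf[7];
        const int n = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
        assert(n > 0 && n < (int) sizeof(buf));
        return vocab.token_to_id.at(buf);
    }

    int main() {
        toy_vocab vocab;
        vocab.token_to_id["<0x0A>"] = 13; // example id for illustration only
        printf("linefeed_id = %d\n", toy_byte_to_token(vocab, '\n'));
        return 0;
    }

Formatting the byte as "<0x%02X>" matches SentencePiece's byte-fallback piece
naming, so the lookup is exact and independent of any tokenizer normalization.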