Skip to content

Commit

Permalink
llama.cpp : fix LF token
Browse files Browse the repository at this point in the history
  • Loading branch information
klosax authored Aug 26, 2023
1 parent 6459cab commit d9a9b09
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1636,6 +1636,7 @@ static void llm_load_hparams(

// TODO: This should probably be in llama.h
// Forward declarations of two tokenizer helpers that llm_load_vocab (below) calls
// when it determines the vocabulary's newline token:
//   - llama_tokenize_internal: tokenizes raw_text against vocab and returns the token ids
//     (the `bos` flag presumably controls whether a BOS token is prepended - confirm at the definition).
//   - llama_byte_to_token: returns the token for a single byte `ch`; llm_load_vocab uses it
//     for '\n' when the vocab type is LLAMA_VOCAB_TYPE_SPM.
// NOTE(review): the definitions are not visible in this hunk; presumably they appear later in the file.
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

static void llm_load_vocab(
llama_model_loader & ml,
Expand Down Expand Up @@ -1737,7 +1738,11 @@ static void llm_load_vocab(
}

// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
} else {
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
}

// special tokens
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
Expand Down

0 comments on commit d9a9b09

Please sign in to comment.