
Commit 17ca832

Streamlining code and adding some more assertions
Important change: I'm classifying added tokens as control tokens now for BPE.
1 parent a4e9448 commit 17ca832

5 files changed (+42, -42 lines)


convert-falcon-hf-to-gguf.py

+2 -6

@@ -161,13 +161,9 @@ def parse_args() -> argparse.Namespace:
 byte_decoder = {v: k for k, v in byte_encoder.items()}

 for i in range(vocab_size):
-    text = reverse_vocab[i]
-    tokens.append(text)
+    tokens.append(reverse_vocab[i])
     scores.append(0.0) # dummy
-    if text in byte_decoder:
-        toktypes.append(gguf.TokenType.BYTE)
-    else:
-        toktypes.append(gguf.TokenType.NORMAL)
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)

convert.py

+2 -8

@@ -343,19 +343,13 @@ def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         byte_encoder = tokenization_gpt2.bytes_to_unicode()
         byte_decoder = {v: k for k, v in byte_encoder.items()}

-        score = 0.0
         for i, _ in enumerate(tokenizer):
-            text = reverse_vocab[i]
-            if text in byte_decoder:
-                toktype = gguf.TokenType.BYTE
-            else:
-                toktype = gguf.TokenType.NORMAL
-            yield text, score, toktype
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL

     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL

     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.bpe_tokens()
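The switch from USER_DEFINED to CONTROL above is the "important change" called out in the commit message: once a BPE model's added tokens carry the CONTROL type in the GGUF vocab, llama.cpp's token-to-piece path (see the llama.cpp hunk below) writes no bytes for them, so they no longer appear as literal text in decoded output. A minimal illustrative sketch of that behavior, using hypothetical mock types rather than llama.cpp's real vocab structures:

// Illustration only: hypothetical mock types, not llama.cpp's real vocab structures.
#include <cstdio>
#include <string>
#include <vector>

enum toktype { TOKTYPE_NORMAL, TOKTYPE_CONTROL };

struct tok {
    std::string text;
    toktype     type;
};

// Detokenize by token type: control tokens (which now include BPE added tokens)
// contribute nothing, mirroring the empty control branch in llama_token_to_piece_with_model.
static std::string detok(const std::vector<tok> & vocab, const std::vector<int> & ids) {
    std::string out;
    for (int id : ids) {
        if (vocab[id].type == TOKTYPE_NORMAL) {
            out += vocab[id].text;
        }
        // TOKTYPE_CONTROL: skipped on purpose
    }
    return out;
}

int main() {
    std::vector<tok> vocab = {
        { "<|endoftext|>", TOKTYPE_CONTROL },  // an added token, now typed as control
        { "Hello",         TOKTYPE_NORMAL  },
        { " world",        TOKTYPE_NORMAL  },
    };
    printf("'%s'\n", detok(vocab, { 1, 2, 0 }).c_str());  // prints 'Hello world'
    return 0;
}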

llama.cpp

+38 -28

@@ -3884,6 +3884,10 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

+static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
@@ -7224,47 +7228,53 @@ static std::string llama_decode_text(const std::string& text) {
 // does not write null-terminator to buf
 int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
-        if (llama_is_normal_token(model->vocab, token)) {
-            std::string result = model->vocab.id_to_token[token].text;
-            if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
+        switch (llama_vocab_get_type(model->vocab)) {
+        case LLAMA_VOCAB_TYPE_SPM: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
-            } else if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_BPE) {
-                result = llama_decode_text(result);
-            } else {
-                GGML_ASSERT(false);
-            }
-            if (length < (int) result.length()) {
-                return -result.length();
-            }
-            memcpy(buf, result.c_str(), result.length());
-            return result.length();
-        } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
-            if (length < 3) {
-                return -3;
-            }
-            buf[0] = '\xe2';
-            buf[1] = '\x96';
-            buf[2] = '\x85';
-            return 3;
-        } else if (llama_is_control_token(model->vocab, token)) {
-            ;
-        } else if (llama_is_byte_token(model->vocab, token)) {
-            if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
+            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+                if (length < 3) {
+                    return -3;
+                }
+                memcpy(buf, "\xe2\x96\x85", 3);
+                return 3;
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
+            } else if (llama_is_byte_token(model->vocab, token)) {
                 if (length < 1) {
                     return -1;
                 }
                 buf[0] = llama_token_to_byte(model->vocab, token);
                 return 1;
-            } else if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_BPE) {
-                std::string result = llama_decode_text(model->vocab.id_to_token[token].text);
-                if (length < (int)result.length()) {
+            } else {
+                GGML_ASSERT(false);
+            }
+            break;
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                result = llama_decode_text(result);
+                if (length < (int) result.length()) {
                     return -result.length();
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
             } else {
                 GGML_ASSERT(false);
             }
+            break;
+        }
+        default:
+            GGML_ASSERT(false);
         }
     }
     return 0;
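On the caller side the restructured function keeps the same contract as before: the return value is the number of bytes written, a negative value means the buffer was too small (its absolute value is the required length), and control tokens, which after this commit include a BPE model's added tokens, produce a zero-length piece. A usage sketch, assuming llama_token_to_piece_with_model is exported through llama.h at this revision and that a llama_model has already been loaded (loading is not shown):

// Sketch only: wraps the buffer-resizing convention of llama_token_to_piece_with_model.
#include "llama.h"
#include <string>
#include <vector>

static std::string token_to_piece(const struct llama_model * model, llama_token token) {
    std::vector<char> buf(8);
    int n = llama_token_to_piece_with_model(model, token, buf.data(), (int) buf.size());
    if (n < 0) {
        // A negative return is the length the piece actually needs; resize and retry.
        buf.resize(-n);
        n = llama_token_to_piece_with_model(model, token, buf.data(), (int) buf.size());
    }
    // The API does not write a null terminator; control tokens come back with n == 0.
    return std::string(buf.data(), n > 0 ? (size_t) n : 0);
}

With added tokens now classified as control for BPE, a helper like this returns an empty string for them instead of their literal text.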

models/ggml-vocab-aquila.gguf

0 Bytes (binary file not shown)

models/ggml-vocab-falcon.gguf

0 Bytes (binary file not shown)

0 commit comments
