Add support for tokenizing and untokenizing UTF-8 in prompt/output #87

Closed · wants to merge 8 commits
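
The changes below are best read together: the prompt path now delegates tokenization to SentencePiece, and the output path buffers byte-fallback pieces so multi-byte UTF-8 characters are printed whole. As a minimal sketch of the prompt side (illustrative only, not part of the diff; it assumes sentencepiece is installed and that models/tokenizer.model, the default path introduced here, is the LLaMA tokenizer):

// sketch.cpp (illustrative): what llama_tokenize delegates to after this change
#include <sentencepiece_processor.h>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    sentencepiece::SentencePieceProcessor sp;
    if (!sp.Load("models/tokenizer.model").ok()) {        // same default path as params.tokenizer
        fprintf(stderr, "failed to load tokenizer\n");
        return 1;
    }

    const std::string prompt = "欢迎";                     // multi-byte UTF-8 input
    const std::vector<int> ids = sp.EncodeAsIds(prompt);  // what llama_tokenize now returns
    for (const int id : ids) {
        // rare characters come back as byte-fallback pieces such as <0xE6>
        printf("%d -> %s\n", id, sp.IdToPiece(id).c_str());
    }
    printf("round trip: %s\n", sp.DecodeIds(ids).c_str()); // DecodeIds restores the original text
    return 0;
}

A standalone build would look something like g++ -std=c++11 sketch.cpp -lsentencepiece, with include and library paths set as in the Makefile change below.
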
1 change: 1 addition & 0 deletions .gitignore
@@ -21,3 +21,4 @@ models/*

arm_neon.h
compile_commands.json
*.dSYM/
6 changes: 3 additions & 3 deletions Makefile
@@ -30,9 +30,9 @@ endif
# Compile flags
#

CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC -g -I/opt/homebrew/include
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -g -I/opt/homebrew/include
LDFLAGS = -L/opt/homebrew/lib -lsentencepiece

# OS specific
# TODO: support Windows
29 changes: 19 additions & 10 deletions main.cpp
@@ -10,6 +10,7 @@
#include <map>
#include <string>
#include <vector>
#include <sentencepiece_processor.h>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
@@ -84,7 +85,7 @@ struct llama_model {
};

// load the model's weights from a file
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
bool llama_model_load(const std::string & fname, llama_model & model, sentencepiece::SentencePieceProcessor & sp, gpt_vocab & vocab, int n_ctx) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

auto fin = std::ifstream(fname, std::ios::binary);
@@ -146,6 +147,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
return false;
}

printf("total pieces: %d", sp.GetPieceSize());

std::string word;
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
@@ -154,8 +157,9 @@
word.resize(len);
fin.read((char *) word.data(), len);

vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
std::string wordx = sp.IdToPiece(i);
vocab.token_to_id[wordx] = i;
vocab.id_to_token[i] = wordx;

//if (i < 30000) {
// printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
@@ -767,11 +771,15 @@ int main(int argc, char ** argv) {

gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
params.tokenizer = "models/tokenizer.model";

if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}

sentencepiece::SentencePieceProcessor sp;
if (!sp.Load(params.tokenizer).ok()) { // bail out early if the tokenizer model cannot be loaded
fprintf(stderr, "%s: failed to load tokenizer from '%s'\n", __func__, params.tokenizer.c_str());
return 1;
}

if (params.seed < 0) {
params.seed = time(NULL);
}
@@ -795,7 +803,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();

if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ??
if (!llama_model_load(params.model, model, sp, vocab, 512)) { // TODO: set context from user input ??
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1;
}
@@ -811,12 +819,12 @@ int main(int argc, char ** argv) {
std::vector<float> logits;

// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(sp, vocab, params.prompt, true);

params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());

// tokenize the reverse prompt
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(sp, vocab, params.antiprompt, false);

printf("\n");
printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -882,6 +890,8 @@ int main(int argc, char ** argv) {
printf(ANSI_COLOR_YELLOW);
}

// buffer byte-fallback UTF-8 tokens like <0xE6>,<0xAC>,<0xA2> that may span multiple outputs, until the sequence is complete
std::vector<gpt_vocab::id> buffids = {};
while (remaining_tokens > 0) {
// predict
if (embd.size() > 0) {
@@ -943,9 +953,8 @@

// display text
if (!input_noecho) {
for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str());
}
untokenize(sp, buffids, embd);

// reset color to default if there is no pending user input
if (params.use_color && embd_inp.size() <= input_consumed) {
printf(ANSI_COLOR_RESET);
@@ -986,7 +995,7 @@ int main(int argc, char ** argv) {
buf[n_read+1] = 0;
}

std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(sp, vocab, buf, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

remaining_tokens -= line_inp.size();
82 changes: 44 additions & 38 deletions utils.cpp
@@ -4,10 +4,7 @@
#include <cstring>
#include <fstream>
#include <regex>
#include <iostream>
#include <iterator>
#include <string>
#include <math.h>
#include <sentencepiece_processor.h>

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@@ -49,6 +46,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.n_batch = std::stoi(argv[++i]);
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "--tokenizer") {
params.tokenizer = argv[++i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--interactive-start") {
@@ -96,6 +95,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " --tokenizer FNAME\n");
fprintf(stderr, " tokenizer path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
}

@@ -272,42 +273,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
return tokens;
}

std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
//auto res = gpt_tokenize(vocab, text);

//if (bos) {
// res.insert(res.begin(), 1); // TODO: replace with vocab.bos
//}

std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos) {
std::vector<gpt_vocab::id> res;

if (bos) {
res.push_back(1); // TODO: replace with vocab.bos
}

//find the longest token that matches the text
int pos = 0;
while (true) {
int l = 0;
int t = 0;
for (const auto & kv : vocab.id_to_token) {
if (kv.second.size() < l) continue;
if (kv.second.size() > text.size() - pos) continue;
if (text.substr(pos, kv.second.size()) == kv.second) {
l = kv.second.size();
t = kv.first;
}
}

if (l == 0) {
break;
}

res.push_back(t);
pos += l;
}

return res;
std::vector<std::string> pieces;
return sp.EncodeAsIds(text);
}

bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
@@ -542,3 +512,39 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t

return (n/k)*row_size;
}

void untokenize(sentencepiece::SentencePieceProcessor &sp, std::vector<gpt_vocab::id> &buffids, std::vector<gpt_vocab::id> &embd)
{
for (auto id : embd)
{
std::string s = sp.IdToPiece(id); // vocab.id_to_token[id];

if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
{
buffids.push_back(id);
std::string txt = sp.DecodeIds(buffids);
// printf("bufferring %s, total buffer: %s\n", s.c_str(), txt.c_str());
}
else if (s.find("▁") == 0)
{
if (!buffids.empty())
{
std::string txt = sp.DecodeIds(buffids);
printf("%s", txt.c_str());
buffids.clear();
}
s = std::regex_replace(s, std::regex("▁"), " ");
printf("%s", s.c_str());
}
else
{
if (!buffids.empty())
{
std::string txt = sp.DecodeIds(buffids);
printf("%s", txt.c_str());
buffids.clear();
}
printf("%s", s.c_str());
}
}
}
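
untokenize exists because SentencePiece represents rare characters as byte-fallback pieces; for example <0xE6>, <0xAC>, <0xA2> together encode 欢, and printing each piece as it arrives would emit invalid UTF-8. The sketch below isolates the same buffering idea (illustrative only, not part of the diff; it assumes a loaded tokenizer and reuses EncodeAsIds output as a stand-in for tokens streamed from the model):

// illustrative only: hold <0xNN> byte pieces until a printable piece arrives
#include <sentencepiece_processor.h>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    sentencepiece::SentencePieceProcessor sp;
    if (!sp.Load("models/tokenizer.model").ok()) return 1;

    std::vector<int> buffered;                                // pending <0xNN> byte pieces
    for (const int id : sp.EncodeAsIds("欢迎 hello")) {
        const std::string piece = sp.IdToPiece(id);
        if (piece.rfind("<0x", 0) == 0 && piece.back() == '>') {
            buffered.push_back(id);                           // incomplete UTF-8, hold it back
            continue;
        }
        if (!buffered.empty()) {
            printf("%s", sp.DecodeIds(buffered).c_str());     // flush a now-complete sequence
            buffered.clear();
        }
        std::string text = piece;
        size_t pos;
        while ((pos = text.find("\xE2\x96\x81")) != std::string::npos) {
            text.replace(pos, 3, " ");                        // map the ▁ word-boundary marker to a space
        }
        printf("%s", text.c_str());
    }
    if (!buffered.empty()) printf("%s", sp.DecodeIds(buffered).c_str());
    printf("\n");
    return 0;
}

Flushing on the first non-byte piece is enough here because the byte pieces for one character arrive consecutively; a stricter variant could also validate that the buffered bytes form complete UTF-8 before printing.
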
7 changes: 6 additions & 1 deletion utils.h
@@ -7,6 +7,7 @@
#include <vector>
#include <random>
#include <thread>
#include <sentencepiece_processor.h>

//
// CLI argument parsing
@@ -27,6 +28,7 @@ struct gpt_params {
int32_t n_batch = 8; // batch size for prompt processing

std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string tokenizer = "models/tokenizer.model"; // tokenizer path
std::string prompt;

bool use_color = false; // use color to distinguish generations and inputs
@@ -73,7 +75,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri

// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
// ref: https://github.com/google/sentencepiece
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos);

// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
@@ -102,3 +104,6 @@ void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int

size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);

void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & buffids, std::vector<gpt_vocab::id> & embd);