Add Proxy and SSL Config Options to Python SDK #3180

Open · wants to merge 112 commits into base: main
Changes from all commits (112 commits)
e8003c8
Add proxy and SSL config options
cgivre Nov 11, 2024
a00b012
Added args to additional method
cgivre Nov 11, 2024
88ef003
Remove binary state from high-level API and use Jinja templates (#3147)
cebtenzzre Nov 25, 2024
c7c99a1
Fixups for Jinja PR (#3215)
cebtenzzre Dec 4, 2024
0ae1ae3
ci: do not run online installer or publish jobs on PR branches (#3217)
cebtenzzre Dec 4, 2024
1ed30da
llamamodel: add missing softmax to fix temperature (#3202)
cebtenzzre Dec 4, 2024
2cad0d7
chat: cut v3.5.0-rc1 release candidate (#3218)
cebtenzzre Dec 4, 2024
87b5127
add changelog entries for Jinja PR (#3223)
cebtenzzre Dec 6, 2024
49363ed
changelog: add more changes from #3147 (#3226)
cebtenzzre Dec 6, 2024
db4d975
Animate the removal of chat items when editing prompts. (#3227)
manyoso Dec 6, 2024
4807e6a
qml: tweaks to new edit/redo buttons (#3228)
cebtenzzre Dec 6, 2024
3b26a65
chat: cut v3.5.0-rc2 release candidate (#3229)
cebtenzzre Dec 6, 2024
a1e38da
chat: run update_translations for v3.5.0 (#3230)
cebtenzzre Dec 6, 2024
7a71600
changelog: fix parenthesis
cebtenzzre Dec 9, 2024
f325cea
Italian localization update (#3236)
Harvester62 Dec 9, 2024
38c1ab2
fixups for GPT4All v3.5.0-rc2 (#3239)
cebtenzzre Dec 9, 2024
3912990
update Romanian translation for v3.5.0 (#3232)
SINAPSA-IC Dec 9, 2024
9a64f52
chat: cut v3.5.0 release (#3240)
cebtenzzre Dec 9, 2024
d11e18c
chat: release v3.5.0 (#3241)
cebtenzzre Dec 9, 2024
f5cee70
Bump version to v3.5.1-dev0 (#3242)
manyoso Dec 9, 2024
ee9dd88
chatmodel: fix incorrect currentResponse argument (#3245)
cebtenzzre Dec 9, 2024
d8f141a
Fix the z-ordering of the home button. (#3246)
manyoso Dec 9, 2024
0107a8c
metadata: fix typos in release notes
cebtenzzre Dec 10, 2024
6077c39
fix several bad chat templates (#3250)
cebtenzzre Dec 10, 2024
e647581
models3: fix Llama 3.2 chat template (#3251)
cebtenzzre Dec 10, 2024
66c9ffe
changelog: add PR #3251
cebtenzzre Dec 10, 2024
b5d67d1
Update changlog and version to make 3.5.1 hotfix release. (#3252)
manyoso Dec 10, 2024
337afa0
Release notes and latestnews for v3.5.1. (#3253)
manyoso Dec 10, 2024
52e8ea4
Bump the version to 3.5.2-dev0. (#3254)
manyoso Dec 10, 2024
167e0de
latestnews: make it more compact
cebtenzzre Dec 12, 2024
cc30175
Fix local server regressions caused by Jinja PR (#3256)
cebtenzzre Dec 13, 2024
e4b0a8d
modellist: fix cloning of chat template and system message (#3262)
cebtenzzre Dec 13, 2024
816158b
StartupDialog: fix two untranslated strings (#3293)
cebtenzzre Dec 13, 2024
b988e82
Break the explore models view into two. (#3269)
manyoso Dec 13, 2024
9eea8b7
chat: cut v3.5.2 release (#3292)
cebtenzzre Dec 13, 2024
383a99b
fix chatmodel.h #includes
cebtenzzre Dec 13, 2024
3218466
ci: attempt to fix Ubuntu build
cebtenzzre Dec 13, 2024
c6f01e0
chat: release version 3.5.2 (#3296)
cebtenzzre Dec 14, 2024
200c5a9
chat: fix localdocs breakage in v3.5.2 (#3302)
cebtenzzre Dec 16, 2024
6fedb79
New v3.5.3 hotfix release. (#3304)
manyoso Dec 16, 2024
e1a9048
ci: downgrade Windows image to fix build (#3306)
cebtenzzre Dec 16, 2024
0b029fa
chat: release version 3.5.3 (#3307)
cebtenzzre Dec 16, 2024
cf342e1
chat: bump version to 3.5.4-dev0
cebtenzzre Dec 16, 2024
cb00613
Update maintainers. (#3322)
manyoso Dec 18, 2024
11c285f
Fix for remote model templates when messages contain xml. (#3318)
manyoso Dec 18, 2024
6b1a140
Fix Jinja2Cpp bug that broke system msg detection in templates (#3325)
cebtenzzre Dec 19, 2024
1ba56b2
chatmodel: fix sources showing as unconsolidated in UI (#3328)
cebtenzzre Dec 19, 2024
33d7166
Code interpreter (#3173)
manyoso Dec 19, 2024
0f95f7c
modellist: automatically replace known chat templates with our versio…
cebtenzzre Dec 19, 2024
0c2c15e
undo unintentional partial revert of #3173
cebtenzzre Dec 19, 2024
b2b9be4
Release of 3.6.0. (#3329)
manyoso Dec 19, 2024
ac08448
qml: fix missing localdocs and prefill progress (#3330)
cebtenzzre Dec 19, 2024
0e5e4ce
Release notes and latestnews for v3.6.0, and bump version. (#3331)
manyoso Dec 19, 2024
d0d857d
ChatView: make "stop" and "copy conversation" work again (#3336)
manyoso Dec 20, 2024
a49c2cf
Release notes for v3.6.1 and bump version (#3339)
manyoso Dec 20, 2024
6499f27
updated settings page (#3368)
mcembalest Jan 7, 2025
5b8bd22
fix: format of language and locale setting (#3370)
mcembalest Jan 7, 2025
68d4ed7
Properly report that the computation was timedout to the model (#3369)
manyoso Jan 7, 2025
1cb34d1
code interpreter: support variadic console.log (#3371)
cebtenzzre Jan 7, 2025
5cd0bd1
chat templates: work around Jinja2Cpp issue with 'not X is defined' (…
cebtenzzre Jan 7, 2025
8ffac8f
jinja2cpp: update submodule for else/endif crash fix (#3373)
cebtenzzre Jan 8, 2025
dd0cef2
Update README.md - brokenlink (#3380)
AndriyMulyar Jan 10, 2025
4c1c026
Save chats on quit, even if window isn't closed first (#3387)
cebtenzzre Jan 16, 2025
ced514c
ci: use the shared 'gpt4all' context for environment variables (#3392)
cebtenzzre Jan 17, 2025
e629721
Add more chat template substitutions (#3393)
cebtenzzre Jan 21, 2025
7336de3
jinja2cpp: update submodule for partial subscript crash fix (#3394)
cebtenzzre Jan 21, 2025
0acdeee
Sign maintenancetool.app on macOS (#3391)
cebtenzzre Jan 21, 2025
e509457
add Windows ARM build (#3385)
cebtenzzre Jan 21, 2025
2ad4aea
ci: add missing context to Windows ARM builds (#3400)
cebtenzzre Jan 21, 2025
db36e30
Italian localization update (#3389)
Harvester62 Jan 21, 2025
f6399a3
jinja2cpp: update submodule for 'not X is defined' fix (#3402)
cebtenzzre Jan 21, 2025
f0ebabd
jinja2cpp: update submodule to fix unused var (#3403)
cebtenzzre Jan 22, 2025
5ee9b97
Bump version for 3.7.0 release. (#3401)
manyoso Jan 21, 2025
58515cf
changelog: add missing link
cebtenzzre Jan 22, 2025
35d9936
changelog: fix reference to wrong macOS version
cebtenzzre Jan 22, 2025
c13b71f
ci: fix macOS codesigning (#3408)
cebtenzzre Jan 23, 2025
8023ba2
chat: release version 3.7.0 (#3407)
cebtenzzre Jan 23, 2025
9809a2a
metadata: fix typo
cebtenzzre Jan 23, 2025
9641b47
chat: bump version to v3.7.1-dev0
cebtenzzre Jan 23, 2025
881ac19
Fix regression while using localdocs with server API. (#3410)
manyoso Jan 24, 2025
2230628
Server view fix (#3411)
manyoso Jan 24, 2025
45a171c
Update to Qt 6.8.1 (#3386)
cebtenzzre Jan 24, 2025
62a5623
cmake: do not modify gpt4all.app after signing it (#3413)
cebtenzzre Jan 24, 2025
722dcb0
Revert "cmake: do not modify gpt4all.app after signing it (#3413)"
cebtenzzre Jan 24, 2025
919b415
cmake: do not modify gpt4all.app after signing it (#3417)
cebtenzzre Jan 24, 2025
a71fed7
codeinterpreter: permit console.log with single string arg (#3426)
cebtenzzre Jan 27, 2025
b5670ae
[Jinja] Fix typo in Phi-3.1-mini-128k-instruct replacement template (…
ThiloteE Jan 28, 2025
e607840
ci: selective signing and automatic release builds (#3430)
cebtenzzre Jan 28, 2025
32badd2
Support DeepSeek-R1 Qwen (#3431)
cebtenzzre Jan 29, 2025
4207680
ci: verify that installers we build function and are signed (#3432)
cebtenzzre Jan 29, 2025
0db1651
Don't block the gui thread for tool calls (#3435)
manyoso Jan 29, 2025
92ada07
ci: build offline installers when pipeline is scheduled (#3436)
cebtenzzre Jan 30, 2025
44b059b
chat: bump version to 3.8.0-dev0
cebtenzzre Jan 30, 2025
0ea95f3
ci: add missing signing holds to Windows ARM builds
cebtenzzre Jan 30, 2025
4e0fda7
chat: replace Jinja2Cpp with minja (#3433)
cebtenzzre Jan 30, 2025
f8b65c5
Display DeepSeek-R1 thinking like Reasoner (#3440)
manyoso Jan 30, 2025
8e055e9
models: add DeepSeek-R1 distillations to official models list (#3437)
cebtenzzre Jan 30, 2025
f8d5224
chat: cut v3.8.0 release (#3441)
cebtenzzre Jan 30, 2025
6b76d6d
ci: remove conflicting pipeline.git.branch requirement
cebtenzzre Jan 30, 2025
436e6de
ci: fix missing job_allow_tags
cebtenzzre Jan 30, 2025
1a11860
ci: allow generate-config to run on tags
cebtenzzre Jan 30, 2025
8c65950
chat: fix emoji corruption (#3443)
cebtenzzre Jan 30, 2025
81f9624
remove ancient README
cebtenzzre Jan 31, 2025
f348050
chat: release version 3.8.0 (#3439)
cebtenzzre Jan 31, 2025
02ee873
ci: update to Qt 6.8.2 (#3442)
cebtenzzre Jan 31, 2025
568c8a1
cmake: remove reference to deleted README
cebtenzzre Jan 31, 2025
3d01855
Fix index used by LocalDocs when tool calling/thinking is active (#3451)
cebtenzzre Feb 3, 2025
7cdc0bc
minja: update submodule to fix `{#` hang (#3446)
cebtenzzre Feb 3, 2025
31753d4
chat: work around Direct3D 11 rendering artifacts on win11 arm (#3450)
cebtenzzre Feb 3, 2025
498e651
Revert "minja: update submodule to fix `{#` hang (#3446)"
cebtenzzre Feb 3, 2025
5bfc071
Update README.md
AndriyMulyar Feb 3, 2025
668214a
Fix rebase conflicts
cgivre Feb 4, 2025
4 changes: 4 additions & 0 deletions .circleci/config.yml
@@ -8,6 +8,10 @@ workflows:
  generate-config:
    jobs:
      - path-filtering/filter:
          filters:
            tags:
              only:
                - /.*/
          base-revision: main
          config-path: .circleci/continue_config.yml
          mapping: |
825 changes: 681 additions & 144 deletions .circleci/continue_config.yml

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions .gitmodules
@@ -17,3 +17,9 @@
[submodule "gpt4all-chat/deps/QXlsx"]
path = gpt4all-chat/deps/QXlsx
url = https://github.com/nomic-ai/QXlsx.git
[submodule "gpt4all-chat/deps/minja"]
path = gpt4all-chat/deps/minja
url = https://github.com/nomic-ai/minja.git
[submodule "gpt4all-chat/deps/json"]
path = gpt4all-chat/deps/json
url = https://github.com/nlohmann/json.git
5 changes: 0 additions & 5 deletions MAINTAINERS.md
@@ -51,11 +51,6 @@ Thiago Ramos ([@thiagojramos](https://github.com/thiagojramos))<br/>
E-mail: thiagojramos@outlook.com<br/>
- pt\_BR translation

Victor Emanuel ([@SINAPSA-IC](https://github.com/SINAPSA-IC))<br/>
E-mail: contact@sinapsaro.ro<br/>
Discord: `@sinapsa_ic_56124_99632`
- ro\_RO translation

不知火 Shiranui ([@supersonictw](https://github.com/supersonictw))<br/>
E-mail: supersonic@livemail.tw<br/>
Discord: `@supersonictw`
7 changes: 4 additions & 3 deletions README.md
@@ -1,5 +1,9 @@
<h1 align="center">GPT4All</h1>

<p align="center">
Now with support for DeepSeek R1 Distillations
</p>

<p align="center">
<a href="https://www.nomic.ai/gpt4all">Website</a> &bull; <a href="https://docs.gpt4all.io">Documentation</a> &bull; <a href="https://discord.gg/mGZE39AS3e">Discord</a> &bull; <a href="https://www.youtube.com/watch?v=gQcZDXRVJok">YouTube Tutorial</a>
</p>
@@ -23,9 +27,6 @@ https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a
<p align="center">
GPT4All is made possible by our compute partner <a href="https://www.paperspace.com/">Paperspace</a>.
</p>
<p align="center">
<a href="https://www.phorm.ai/query?projectId=755eecd3-24ad-49cc-abf4-0ab84caacf63"><img src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg" alt="phorm.ai"></a>
</p>

## Download Links

1 change: 0 additions & 1 deletion common/common.cmake
@@ -11,7 +11,6 @@ function(gpt4all_add_warning_options target)
-Wextra-semi
-Wformat=2
-Wmissing-include-dirs
-Wstrict-overflow=2
-Wsuggest-override
-Wvla
# errors
2 changes: 1 addition & 1 deletion gpt4all-backend/deps/llama.cpp-mainline
Submodule llama.cpp-mainline updated 64 files
+3 −3 .devops/full-rocm.Dockerfile
+3 −3 .devops/llama-cli-rocm.Dockerfile
+3 −3 .devops/llama-server-rocm.Dockerfile
+1 −1 .github/workflows/build.yml
+3 −1 .github/workflows/python-type-check.yml
+8 −3 CMakeLists.txt
+5 −0 CONTRIBUTING.md
+1 −12 Makefile
+5 −2 README.md
+76 −9 ci/run.sh
+15 −3 common/arg.cpp
+7 −0 common/common.cpp
+1 −0 common/common.h
+3 −0 common/console.cpp
+102 −31 convert_hf_to_gguf.py
+2 −0 convert_hf_to_gguf_update.py
+4 −1 convert_lora_to_gguf.py
+73 −19 docs/backend/SYCL.md
+0 −1 examples/CMakeLists.txt
+0 −6 examples/benchmark/CMakeLists.txt
+0 −275 examples/benchmark/benchmark-matmult.cpp
+7 −7 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+0 −7 examples/cvector-generator/pca.hpp
+6 −1 examples/embedding/embedding.cpp
+49 −37 examples/gguf-split/gguf-split.cpp
+0 −6 examples/llava/clip.cpp
+2 −2 examples/llava/convert_image_encoder_to_gguf.py
+38 −1 examples/server/README.md
+199 −16 examples/server/server.cpp
+1 −1 examples/server/tests/features/embeddings.feature
+42 −0 examples/server/tests/features/rerank.feature
+53 −1 examples/server/tests/features/steps/steps.py
+1 −1 examples/server/tests/requirements.txt
+24 −1 examples/server/utils.hpp
+3 −3 flake.lock
+0 −5 ggml/include/ggml-metal.h
+36 −30 ggml/include/ggml.h
+17 −2 ggml/src/CMakeLists.txt
+2 −11 ggml/src/ggml-aarch64.c
+0 −1 ggml/src/ggml-cuda/im2col.cu
+1,899 −1,787 ggml/src/ggml-metal.m
+2 −2 ggml/src/ggml-quants.c
+0 −4 ggml/src/ggml-quants.h
+207 −209 ggml/src/ggml-vulkan.cpp
+533 −1,037 ggml/src/ggml.c
+4 −6 ggml/src/vulkan-shaders/argsort.comp
+30 −0 gguf-py/gguf/constants.py
+3 −0 gguf-py/gguf/gguf_writer.py
+14 −2 gguf-py/gguf/tensor_mapping.py
+9 −4 include/llama.h
+112 −0 models/ggml-vocab-chameleon.gguf.inp
+46 −0 models/ggml-vocab-chameleon.gguf.out
+2 −1 pyrightconfig.json
+1 −1 requirements/requirements-convert_legacy_llama.txt
+1 −1 scripts/sync-ggml.last
+191 −108 src/llama-vocab.cpp
+9 −0 src/llama-vocab.h
+370 −29 src/llama.cpp
+6 −4 src/unicode-data.cpp
+4 −4 src/unicode-data.h
+14 −7 src/unicode.cpp
+240 −217 tests/test-backend-ops.cpp
+9 −5 tests/test-grad0.cpp
+55 −35 tests/test-tokenizer-0.cpp
68 changes: 34 additions & 34 deletions gpt4all-backend/include/gpt4all-backend/llmodel.h
@@ -5,6 +5,7 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <expected>
#include <functional>
#include <optional>
#include <span>
@@ -24,6 +25,10 @@ using namespace std::string_literals;
class LLModel {
public:
using Token = int32_t;
using PromptCallback = std::function<bool(std::span<const Token> batch, bool cached)>;
using ResponseCallback = std::function<bool(Token token, std::string_view piece)>;
using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
using ProgressCallback = std::function<bool(float progress)>;

class BadArchError: public std::runtime_error {
public:
@@ -101,6 +106,7 @@ class LLModel {
static int32_t maxContextLength(const std::string &modelPath);
static int32_t layerCount(const std::string &modelPath);
static bool isEmbeddingModel(const std::string &modelPath);
static auto chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>;
static void setImplementationsSearchPath(const std::string &path);
static const std::string &implementationsSearchPath();
static bool hasSupportedCPU();
@@ -124,7 +130,6 @@ class LLModel {
};

struct PromptContext {
int32_t n_past = 0; // number of tokens in past conversation
int32_t n_predict = 200;
int32_t top_k = 40;
float top_p = 0.9f;
@@ -136,8 +141,6 @@ class LLModel {
float contextErase = 0.5f; // percent of context to erase if we exceed the context window
};

using ProgressCallback = std::function<bool(float progress)>;

explicit LLModel() {}
virtual ~LLModel() {}

Expand All @@ -154,16 +157,12 @@ class LLModel {

// This method requires the model to return true from supportsCompletion otherwise it will throw
// an error
virtual void prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &ctx,
bool special = false,
std::optional<std::string_view> fakeReply = {});
virtual void prompt(std::string_view prompt,
const PromptCallback &promptCallback,
const ResponseCallback &responseCallback,
const PromptContext &ctx);

using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
virtual int32_t countPromptTokens(std::string_view prompt) const;

virtual size_t embeddingSize() const {
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
@@ -209,23 +208,22 @@ class LLModel {
void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }

virtual int32_t contextLength() const = 0;
virtual auto specialTokens() -> std::unordered_map<std::string, std::string> const = 0;

protected:
// These are pure virtual because subclasses need to implement as the default implementation of
// 'prompt' above calls these functions
virtual std::vector<Token> tokenize(std::string_view str, bool special = false) = 0;
virtual std::vector<Token> tokenize(std::string_view str) const = 0;
virtual bool isSpecialToken(Token id) const = 0;
virtual std::string tokenToString(Token id) const = 0;
virtual void initSampler(PromptContext &ctx) = 0;
virtual void initSampler(const PromptContext &ctx) = 0;
virtual Token sampleToken() const = 0;
virtual bool evalTokens(PromptContext &ctx, std::span<const Token> tokens) const = 0;
virtual void shiftContext(PromptContext &promptCtx) = 0;
virtual bool evalTokens(int32_t nPast, std::span<const Token> tokens) const = 0;
virtual void shiftContext(const PromptContext &promptCtx, int32_t *nPast) = 0;
virtual int32_t inputLength() const = 0;
virtual void setTokenizeInputPosition(int32_t pos) = 0;
virtual auto computeModelInputPosition(PromptContext &ctx, const std::vector<Token> &input)
-> std::vector<Token>::const_iterator = 0;
virtual void setModelInputPosition(PromptContext &ctx, int32_t pos) = 0;
virtual void appendInputToken(PromptContext &ctx, Token tok) = 0;
virtual int32_t computeModelInputPosition(std::span<const Token> input) const = 0;
virtual void setModelInputPosition(int32_t pos) = 0;
virtual void appendInputToken(Token tok) = 0;
virtual std::span<const Token> inputTokens() const = 0;
virtual const std::vector<Token> &endTokens() const = 0;
virtual bool shouldAddBOS() const = 0;
@@ -242,6 +240,12 @@ class LLModel {
return -1;
}

virtual auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string>
{
(void)modelPath;
return std::unexpected("not implemented");
}

const Implementation *m_implementation = nullptr;

ProgressCallback m_progressCallback;
@@ -253,19 +257,15 @@ class LLModel {
return true;
}

bool decodePrompt(std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
std::vector<Token> embd_inp,
bool isResponse = false,
bool alwaysDecode = false);
void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx);

protected:
Token m_tokenize_last_token = -1; // not serialized
// prefill context with prompt
auto decodePrompt(const PromptCallback &promptCallback,
const PromptContext &promptCtx,
std::vector<Token> embd_inp)
-> std::optional<int32_t>;
// generate a response
void generateResponse(const ResponseCallback &responseCallback,
const PromptContext &promptCtx,
int32_t nPast);

friend class LLMImplementation;
};
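To make the reworked C++ prompting interface easier to follow, here is a minimal usage sketch. It is not part of the diff: the include path, the already-loaded `model` reference, and the assumption that the prompt string has already been rendered through a chat template are all illustrative, but the callback types and the `prompt()` signature follow the header above.

```cpp
// Sketch only: assumes the header is reachable as <gpt4all-backend/llmodel.h>
// and that `model` was constructed and loaded elsewhere.
#include <gpt4all-backend/llmodel.h>

#include <iostream>
#include <span>
#include <string_view>

void streamCompletion(LLModel &model, std::string_view renderedPrompt)
{
    // Sampling settings; n_past is no longer a PromptContext field, the model
    // now tracks the input position internally.
    LLModel::PromptContext ctx;
    ctx.n_predict = 128;

    // Called as prompt batches are ingested; return false to stop early.
    LLModel::PromptCallback onPrompt =
        [](std::span<const LLModel::Token> batch, bool cached) {
            (void)batch; (void)cached;
            return true;
        };

    // Called for each generated token; return false to stop generation.
    LLModel::ResponseCallback onResponse =
        [](LLModel::Token token, std::string_view piece) {
            (void)token;
            std::cout << piece << std::flush;
            return true;
        };

    // The prompt template, allowContextShift, special, and fakeReply parameters
    // of the old overload are gone; templating now happens before this call.
    model.prompt(renderedPrompt, onPrompt, onResponse, ctx);
}
```

The new `countPromptTokens()` declaration in the same hunk can be used beforehand to check how much of the context window the rendered prompt will consume.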
44 changes: 23 additions & 21 deletions gpt4all-backend/include/gpt4all-backend/llmodel_c.h
@@ -35,16 +35,15 @@ typedef int32_t token_t;
* behavior.
*/
struct llmodel_prompt_context {
int32_t n_past; // number of tokens in past conversation
int32_t n_predict; // number of tokens to predict
int32_t top_k; // top k logits to sample from
float top_p; // nucleus sampling probability threshold
float min_p; // Min P sampling
float temp; // temperature to adjust model's output distribution
int32_t n_batch; // number of predictions to generate in parallel
float repeat_penalty; // penalty factor for repeated tokens
int32_t repeat_last_n; // last n tokens to penalize
float context_erase; // percent of context to erase if we exceed the context window
};

struct llmodel_gpu_device {
@@ -63,18 +62,20 @@ typedef struct llmodel_gpu_device llmodel_gpu_device;

/**
* Callback type for prompt processing.
* @param token_id The token id of the prompt.
* @param token_ids An array of token ids of the prompt.
* @param n_token_ids The number of tokens in the array.
* @param cached Whether the tokens were already in cache.
* @return a bool indicating whether the model should keep processing.
*/
typedef bool (*llmodel_prompt_callback)(int32_t token_id);
typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token_ids, bool cached);

/**
* Callback type for response.
* @param token_id The token id of the response.
* @param response The response string. NOTE: a token_id of -1 indicates the string is an error string.
* @return a bool indicating whether the model should keep generating.
*/
typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response);
typedef bool (*llmodel_response_callback)(token_t token_id, const char *response);

/**
* Embedding cancellation callback for use with llmodel_embed.
@@ -85,6 +86,8 @@ typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response
*/
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);

typedef void (*llmodel_special_token_callback)(const char *name, const char *token);

/**
* Create a llmodel instance.
* Recognises correct model type from file at model_path
@@ -183,22 +186,17 @@ uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint6
* Generate a response using the model.
* @param model A pointer to the llmodel_model instance.
* @param prompt A string representing the input prompt.
* @param prompt_template A string representing the input prompt template.
* @param prompt_callback A callback function for handling the processing of prompt.
* @param response_callback A callback function for handling the generated response.
* @param allow_context_shift Whether to allow shifting of context to make room for more input.
* @param special True if special tokens in the prompt should be processed, false otherwise.
* @param fake_reply A string to insert into context as the model's reply, or NULL to generate one.
* @param ctx A pointer to the llmodel_prompt_context structure.
* @param error A pointer to a string; will only be set on error.
*/
void llmodel_prompt(llmodel_model model, const char *prompt,
const char *prompt_template,
llmodel_prompt_callback prompt_callback,
llmodel_response_callback response_callback,
bool allow_context_shift,
llmodel_prompt_context *ctx,
bool special,
const char *fake_reply);
bool llmodel_prompt(llmodel_model model,
const char *prompt,
llmodel_prompt_callback prompt_callback,
llmodel_response_callback response_callback,
llmodel_prompt_context *ctx,
const char **error);

/**
* Generate an embedding using the model.
Expand Down Expand Up @@ -310,6 +308,10 @@ const char *llmodel_model_backend_name(llmodel_model model);
*/
const char *llmodel_model_gpu_device_name(llmodel_model model);

int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error);

void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback);

#ifdef __cplusplus
}
#endif
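The C API in this header picks up the same reshaping. Below is a rough sketch of driving the updated `llmodel_prompt` entry point with the new callback signatures; it is not part of the diff, the include path and the already-created `model` handle are assumptions, and the sampling values are only illustrative.

```cpp
// Sketch only: assumes the header is reachable as <gpt4all-backend/llmodel_c.h>
// and that `model` was created and loaded through the existing C API elsewhere.
#include <gpt4all-backend/llmodel_c.h>

#include <cstdio>

// Prompt callback now receives the whole token batch plus a cache flag.
static bool on_prompt(const token_t *token_ids, size_t n_token_ids, bool cached)
{
    (void)token_ids;
    std::printf("[ingested %zu prompt tokens%s]\n", n_token_ids, cached ? ", cached" : "");
    return true;
}

// Response callback: per the header, a token_id of -1 marks an error string.
static bool on_response(token_t token_id, const char *response)
{
    (void)token_id;
    std::fputs(response, stdout);
    return true;
}

bool generate(llmodel_model model, const char *prompt)
{
    llmodel_prompt_context ctx = {};  // n_past is no longer a field
    ctx.n_predict      = 128;         // illustrative sampling values
    ctx.top_k          = 40;
    ctx.top_p          = 0.9f;
    ctx.min_p          = 0.0f;
    ctx.temp           = 0.7f;
    ctx.n_batch        = 8;
    ctx.repeat_penalty = 1.18f;
    ctx.repeat_last_n  = 64;
    ctx.context_erase  = 0.5f;

    // llmodel_prompt now returns a bool and reports errors through `error`
    // instead of taking template/special/fake-reply arguments.
    const char *error = nullptr;
    if (!llmodel_prompt(model, prompt, on_prompt, on_response, &ctx, &error)) {
        std::fprintf(stderr, "llmodel_prompt failed: %s\n", error ? error : "(unknown error)");
        return false;
    }
    return true;
}
```

The other additions near the end of the hunk follow the same conventions: `llmodel_count_prompt_tokens` reports failures through the same `const char **error` out-parameter, and `llmodel_model_foreach_special_token` invokes the supplied callback with each special token's name and text.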