Temp #16 (Merged)

merged 30 commits on Nov 8, 2024
Changes from all commits (30 commits)
e2292aa
metal : fix minor string leaks (ggml/1004)
pminev Nov 1, 2024
284e5b0
cmake : make it possible linking ggml as external lib (ggml/1003)
ykhrustalev Nov 2, 2024
ce027ad
sync : ggml
ggerganov Nov 4, 2024
329ed91
CANN: adjust backend registry refactor. (#10158)
leo-pony Nov 4, 2024
f8e5813
metal : move dequantize templates to beginning of MSL source (#0)
ggerganov Nov 4, 2024
05697f6
metal : simplify f16 and f32 dequant kernels (#0)
ggerganov Nov 4, 2024
ea02c75
cuda : clear error after changing peer access (#10153)
slaren Nov 4, 2024
6a066b9
fix build break on arm64 linux (#10166)
snadampal Nov 4, 2024
9e0ecfb
server : clarify /slots endpoint, add is_processing (#10162)
ngxson Nov 4, 2024
401558b
ggml : fix q4xx mat mul, increase ggml_aligned_malloc alignment (#10167)
slaren Nov 4, 2024
d5a409e
ggml : fix gelu tables initialization (#10172)
slaren Nov 4, 2024
3407364
Q6_K AVX improvements (#10118)
netrunnereve Nov 4, 2024
a9e8a9a
ggml : fix arch check in bf16_to_fp32 (#10164)
slaren Nov 4, 2024
b8deef0
llama : add <|tool_call|> formatting to Granite template (#10177)
gabe-l-hart Nov 5, 2024
a1eaf6a
metal : add quantized FA support (#10149)
ggerganov Nov 6, 2024
1dc04b2
ggml : adjust is_first_call init value (#10193)
ggerganov Nov 6, 2024
94d8cb8
metal : fix from ptr buffer name (#10189)
slaren Nov 6, 2024
b11f9ba
server : remove hack for extra parallel slot (#10187)
ggerganov Nov 6, 2024
5c333e0
metal : add BF16 support (#8439)
ggerganov Nov 6, 2024
3bcd40b
Optimize RWKV6 Operator Naming and Implement Multi-core CPU/ SYCL Acc…
uniartisan Nov 7, 2024
2319126
fix q4_0_8_8 format for corrupted tokens issue (#10198)
snadampal Nov 7, 2024
5107e8c
DRY: Fixes clone functionality (#10192)
wwoodsTM Nov 7, 2024
60e17ce
Remove identical wte/etw logic for jais (#10203)
fmz Nov 7, 2024
97404c4
ggml : add ggml-cpu.h to the public headers (#10204)
slaren Nov 7, 2024
a2c6fd7
scripts : sync update
ggerganov Nov 7, 2024
3b08828
sync : ggml
ggerganov Nov 7, 2024
eec4d71
scripts : add amx to sync-ggml.sh [no ci]
ggerganov Nov 7, 2024
a71d81c
server : revamp chat UI with vuejs and daisyui (#10175)
ngxson Nov 7, 2024
76c6e7f
server : minor UI fix (#10207)
ngxson Nov 7, 2024
c0d480a
Merge branch 'master' into temp
apicalshark Nov 8, 2024
10 changes: 10 additions & 0 deletions .editorconfig
@@ -24,6 +24,16 @@ insert_final_newline = unset
[examples/server/public/*]
indent_size = 2

[examples/server/public/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset

[examples/server/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset

[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab

9 changes: 9 additions & 0 deletions .github/workflows/build.yml
@@ -63,6 +63,14 @@ env:

jobs:

# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124

# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# would be great if we fix these

# CUDA Release

@@ -232,6 +240,7 @@ jobs:

release:
permissions: write-all

if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

runs-on: ubuntu-latest
17 changes: 4 additions & 13 deletions Makefile
@@ -1455,22 +1455,13 @@ llama-server: \
examples/server/server.cpp \
examples/server/utils.hpp \
examples/server/httplib.h \
examples/server/colorthemes.css.hpp \
examples/server/style.css.hpp \
examples/server/theme-beeninorder.css.hpp \
examples/server/theme-ketivah.css.hpp \
examples/server/theme-mangotango.css.hpp \
examples/server/theme-playground.css.hpp \
examples/server/theme-polarnight.css.hpp \
examples/server/theme-snowstorm.css.hpp \
examples/server/index.html.hpp \
examples/server/index-new.html.hpp \
examples/server/index.js.hpp \
examples/server/completion.js.hpp \
examples/server/system-prompts.js.hpp \
examples/server/prompt-formats.js.hpp \
examples/server/json-schema-to-grammar.mjs.hpp \
examples/server/loading.html.hpp \
examples/server/deps_daisyui.min.css.hpp \
examples/server/deps_markdown-it.js.hpp \
examples/server/deps_tailwindcss.js.hpp \
examples/server/deps_vue.esm-browser.js.hpp \
common/json.hpp \
common/stb_image.h \
$(OBJ_ALL)
6 changes: 0 additions & 6 deletions convert_hf_to_gguf.py
@@ -3748,10 +3748,7 @@ def __init__(self, *args, **kwargs):

# Embeddings scale
self.embeddings_scale = 1.0
# note: For some JAIS flavors, output is tied to (same as) wte in original model
self.output_is_wte = False
if 'mup_embeddings_scale' in self.hparams:
self.output_is_wte = True # Hack (?)
self.embeddings_scale = self.hparams['mup_embeddings_scale']
elif 'embeddings_scale' in self.hparams:
self.embeddings_scale = self.hparams['embeddings_scale']
@@ -3808,10 +3805,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
tensors.append((new_name, data_torch * self.embeddings_scale))
if self.output_is_wte:
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
assert not self.output_is_wte
tensors.append((new_name, data_torch * self.width_scale))
else:
tensors.append((new_name, data_torch))
2 changes: 1 addition & 1 deletion docs/backend/SYCL.md
@@ -377,7 +377,7 @@ found 2 SYCL devices:

|Chosen Device ID|Setting|
|-|-|
|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
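The corrected table maps each device ID to a selector value. As a minimal sketch of applying it (assuming a oneAPI Level Zero runtime is installed; any binary launched afterwards inherits the setting):

```shell
# Pin SYCL execution to the first Level Zero device (ID 0).
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
echo "ONEAPI_DEVICE_SELECTOR=$ONEAPI_DEVICE_SELECTOR"
```

Unsetting the variable restores the default device enumeration.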

17 changes: 4 additions & 13 deletions examples/server/CMakeLists.txt
@@ -15,22 +15,13 @@ set(TARGET_SRCS
httplib.h
)
set(PUBLIC_ASSETS
colorthemes.css
style.css
theme-beeninorder.css
theme-ketivah.css
theme-mangotango.css
theme-playground.css
theme-polarnight.css
theme-snowstorm.css
index.html
index-new.html
index.js
completion.js
system-prompts.js
prompt-formats.js
json-schema-to-grammar.mjs
loading.html
deps_daisyui.min.css
deps_markdown-it.js
deps_tailwindcss.js
deps_vue.esm-browser.js
)

foreach(asset ${PUBLIC_ASSETS})
10 changes: 10 additions & 0 deletions examples/server/README.md
@@ -928,6 +928,16 @@ Apart from error types supported by OAI, we also have custom types that are spec
}
```

### Legacy completion web UI

A new chat-based UI has replaced the old completion-based one since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). To use the old completion UI, start the server with `--path ./examples/server/public_legacy`.

For example:

```sh
./llama-server -m my_model.gguf -c 8192 --path ./examples/server/public_legacy
```

### Extending or building alternative Web Front End

You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
2 changes: 1 addition & 1 deletion examples/server/chat.mjs
@@ -1,7 +1,7 @@
import * as readline from 'node:readline'
import { stdin, stdout } from 'node:process'
import { readFileSync } from 'node:fs'
import { SchemaConverter } from './public/json-schema-to-grammar.mjs'
import { SchemaConverter } from './public_legacy/json-schema-to-grammar.mjs'

const args = process.argv.slice(2);
const grammarJsonSchemaFile = args.find(
19 changes: 17 additions & 2 deletions examples/server/deps.sh
@@ -6,5 +6,20 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
PUBLIC=$DIR/public

echo "download js bundle files"
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
echo >> $PUBLIC/index.js # add newline

# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI

curl -L https://cdn.tailwindcss.com/3.4.14 > $PUBLIC/deps_tailwindcss.js
echo >> $PUBLIC/deps_tailwindcss.js # add newline

curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > $PUBLIC/deps_daisyui.min.css
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> $PUBLIC/deps_daisyui.min.css
echo >> $PUBLIC/deps_daisyui.min.css # add newline

curl -L https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > $PUBLIC/deps_vue.esm-browser.js
echo >> $PUBLIC/deps_vue.esm-browser.js # add newline

curl -L https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > $PUBLIC/deps_markdown-it.js
echo >> $PUBLIC/deps_markdown-it.js # add newline

ls -lah $PUBLIC
29 changes: 25 additions & 4 deletions examples/server/public/completion.js
@@ -1,12 +1,16 @@
const paramDefaults = {
stream: true,
n_predict: 500,
temperature: 0.2,
stop: ["</s>"]
};

let generation_settings = null;

export class CompletionError extends Error {
constructor(message, name, data) {
super(message);
this.name = name;
}
};

// Completes the prompt as a generator. Recommended for most use cases.
//
@@ -29,7 +33,7 @@ export async function* llama(prompt, params = {}, config = {}) {

const completionParams = { ...paramDefaults, ...params, prompt };

const response = await fetch(`${api_url}/completion`, {
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
method: 'POST',
body: JSON.stringify(completionParams),
headers: {
@@ -41,6 +45,18 @@ export async function* llama(prompt, params = {}, config = {}) {
signal: controller.signal,
});

const status = response.status;
if (status !== 200) {
try {
const body = await response.json();
if (body && body.error && body.error.message) {
throw new CompletionError(body.error.message, 'ServerError');
}
} catch (err) {
throw new CompletionError(err.message, 'ServerError');
}
}

const reader = response.body.getReader();
const decoder = new TextDecoder();

@@ -78,7 +94,12 @@ export async function* llama(prompt, params = {}, config = {}) {
for (const line of lines) {
const match = regex.exec(line);
if (match) {
result[match[1]] = match[2]
result[match[1]] = match[2];
if (result.data === '[DONE]') {
cont = false;
break;
}

// since we know this is llama.cpp, let's just decode the json in data
if (result.data) {
result.data = JSON.parse(result.data);
13 changes: 13 additions & 0 deletions examples/server/public/deps_daisyui.min.css

Large diffs are not rendered by default.
