
Commit 31cfbb1

tobi and ggerganov authored
Expose generation timings from server & update completions.js (ggml-org#2116)
* use javascript generators as a much cleaner API; also add ways to access the completion as a promise and as an EventSource
* export llama_timings as struct and expose them in server
* update readme, update baked includes
* llama : uniform variable names + struct init

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 983b555 commit 31cfbb1

9 files changed: +1926 −1368 lines


examples/server/README.md (+20 −17)

````diff
@@ -26,20 +26,17 @@ Command line options:
 
 ## Build
 
-Build llama.cpp with server from repository root with either make or CMake.
+server is build alongside everything else from the root of the project
 
 - Using `make`:
 
 ```bash
-LLAMA_BUILD_SERVER=1 make
+make
 ```
 
 - Using `CMake`:
 
 ```bash
-mkdir build-server
-cd build-server
-cmake -DLLAMA_BUILD_SERVER=ON ..
 cmake --build . --config Release
 ```
````

````diff
@@ -208,24 +205,30 @@ openai.api_base = "http://<Your api-server IP>:port"
 
 Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
 
-### Extending the Web Front End
+### Extending or building alternative Web Front End
 
-The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method. A simple example is below:
+The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
 
-```
+Read the documentation in `/completion.js` to see convenient ways to access llama.
+
+A simple example is below:
+
+```html
 <html>
   <body>
     <pre>
       <script type="module">
-        import { llamaComplete } from '/completion.js'
-
-        llamaComplete({
-            prompt: "### Instruction:\nWrite dad jokes, each one paragraph. You can use html formatting if needed.\n\n### Response:",
-            n_predict: 1024,
-          },
-          null,
-          (chunk) => document.write(chunk.data.content)
-        )
+        import { llama } from '/completion.js'
+
+        const prompt = `### Instruction:
+Write dad jokes, each one paragraph.
+You can use html formatting if needed.
+
+### Response:`
+
+        for await (const chunk of llama(prompt)) {
+          document.write(chunk.data.content)
+        }
       </script>
     </pre>
   </body>
````
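The `llama()` helper used in this README example also accepts a third `config` argument whose `controller` field (see the `public/completion.js` diff further down) lets the page supply its own `AbortController` and cancel a stream mid-generation. Below is a minimal sketch of that pattern, not part of the commit itself; the stop-button element is hypothetical:

```js
// Sketch only: cancelling a streamed completion via the config.controller
// option of the new llama() generator. The "stop" button is a hypothetical
// page element, not something added by this commit.
import { llama } from '/completion.js'

const controller = new AbortController()
document.getElementById('stop').onclick = () => controller.abort()

try {
  for await (const chunk of llama("Tell me a joke", { n_predict: 800 }, { controller })) {
    document.write(chunk.data.content)
  }
} catch (e) {
  // llama() rethrows AbortError after cleanup, so a cancelled run lands here
  if (e.name !== 'AbortError') throw e
}
```

Aborting the controller makes the fetch inside `llama()` reject with an `AbortError`, which the updated code rethrows without logging it as an error.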

examples/server/completion.js.hpp (+365 −183)

Large diffs are not rendered by default.

examples/server/deps.sh (−4)

```diff
@@ -4,10 +4,6 @@
 # get the directory of this script file
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 PUBLIC=$DIR/public
-OUTPUT=$DIR/templats.hpp
-
-echo "// Generated file, do not edit" > $OUTPUT
-echo "" > $OUTPUT
 
 echo "download js bundle files"
 curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
```

examples/server/index.html.hpp (+816 −763)

Large diffs are not rendered by default.

examples/server/public/completion.js (+103 −16)

```diff
@@ -5,20 +5,29 @@ const paramDefaults = {
   stop: ["</s>"]
 };
 
-/**
- * This function completes the input text using a llama dictionary.
- * @param {object} params - The parameters for the completion request.
- * @param {object} controller - an instance of AbortController if you need one, or null.
- * @param {function} callback - The callback function to call when the completion is done.
- * @returns {string} the completed text as a string. Ideally ignored, and you get at it via the callback.
- */
-export const llamaComplete = async (params, controller, callback) => {
+let generation_settings = null;
+
+
+// Completes the prompt as a generator. Recommended for most use cases.
+//
+// Example:
+//
+//    import { llama } from '/completion.js'
+//
+//    const request = llama("Tell me a joke", {n_predict: 800})
+//    for await (const chunk of request) {
+//      document.write(chunk.data.content)
+//    }
+//
+export async function* llama(prompt, params = {}, config = {}) {
+  let controller = config.controller;
+
   if (!controller) {
     controller = new AbortController();
   }
-  const completionParams = { ...paramDefaults, ...params };
 
-  // we use fetch directly here becasue the built in fetchEventSource does not support POST
+  const completionParams = { ...paramDefaults, ...params, prompt };
+
   const response = await fetch("/completion", {
     method: 'POST',
     body: JSON.stringify(completionParams),
@@ -36,7 +45,6 @@ export const llamaComplete = async (params, controller, callback) => {
   let content = "";
 
   try {
-
     let cont = true;
 
     while (cont) {
@@ -59,18 +67,21 @@ export const llamaComplete = async (params, controller, callback) => {
       result.data = JSON.parse(result.data);
       content += result.data.content;
 
-      // callack
-      if (callback) {
-        cont = callback(result) != false;
-      }
+      // yield
+      yield result;
 
       // if we got a stop token from server, we will break here
      if (result.data.stop) {
+        if (result.data.generation_settings) {
+          generation_settings = result.data.generation_settings;
+        }
         break;
       }
     }
   } catch (e) {
-    console.error("llama error: ", e);
+    if (e.name !== 'AbortError') {
+      console.error("llama error: ", e);
+    }
     throw e;
   }
   finally {
@@ -79,3 +90,79 @@ export const llamaComplete = async (params, controller, callback) => {
 
   return content;
 }
+
+// Call llama, return an event target that you can subcribe to
+//
+// Example:
+//
+//    import { llamaEventTarget } from '/completion.js'
+//
+//    const conn = llamaEventTarget(prompt)
+//    conn.addEventListener("message", (chunk) => {
+//      document.write(chunk.detail.content)
+//    })
+//
+export const llamaEventTarget = (prompt, params = {}, config = {}) => {
+  const eventTarget = new EventTarget();
+  (async () => {
+    let content = "";
+    for await (const chunk of llama(prompt, params, config)) {
+      if (chunk.data) {
+        content += chunk.data.content;
+        eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
+      }
+      if (chunk.data.generation_settings) {
+        eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
+      }
+      if (chunk.data.timings) {
+        eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
+      }
+    }
+    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
+  })();
+  return eventTarget;
+}
+
+// Call llama, return a promise that resolves to the completed text. This does not support streaming
+//
+// Example:
+//
+//     llamaPromise(prompt).then((content) => {
+//       document.write(content)
+//     })
+//
+//     or
+//
+//     const content = await llamaPromise(prompt)
+//     document.write(content)
+//
+export const llamaPromise = (prompt, params = {}, config = {}) => {
+  return new Promise(async (resolve, reject) => {
+    let content = "";
+    try {
+      for await (const chunk of llama(prompt, params, config)) {
+        content += chunk.data.content;
+      }
+      resolve(content);
+    } catch (error) {
+      reject(error);
+    }
+  });
+};
+
+/**
+ * (deprecated)
+ */
+export const llamaComplete = async (params, controller, callback) => {
+  for await (const chunk of llama(params.prompt, params, { controller })) {
+    callback(chunk);
+  }
+}
+
+// Get the model info from the server. This is useful for getting the context window and so on.
+export const llamaModelInfo = async () => {
+  if (!generation_settings) {
+    generation_settings = await fetch("/model.json").then(r => r.json());
+  }
+  return generation_settings;
+}
```
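Since the headline change is exposing generation timings, the final streamed chunk (the one carrying `data.stop`) can now include `timings` and `generation_settings`, which `llamaEventTarget` re-emits as DOM events and `llamaModelInfo` serves from cache. A hedged sketch of wiring those up follows; the exact keys inside the timings object come from the server-side `llama_timings` struct and are not shown in this diff, so it is logged wholesale rather than field by field:

```js
// Sketch only: consuming the newly exposed timing information through the
// event-target wrapper. The layout of `timings` is defined server-side and
// not shown in this diff, so the whole object is logged as-is.
import { llamaEventTarget, llamaModelInfo } from '/completion.js'

const conn = llamaEventTarget("Tell me a joke")

// Streamed tokens, as in the comment example inside completion.js.
conn.addEventListener("message", (chunk) => {
  document.write(chunk.detail.content)
})

// Re-emitted from the final chunk when the server attaches timings.
conn.addEventListener("timings", (event) => {
  console.log("generation timings:", event.detail)
})

// Re-emitted from the final chunk with the generation settings that were used.
conn.addEventListener("generation_settings", (event) => {
  console.log("generation settings:", event.detail)
})

// llamaModelInfo() returns the cached settings, or fetches /model.json on first use.
conn.addEventListener("done", async () => {
  console.log("model info:", await llamaModelInfo())
})
```

The same data is available without the event wrapper: the last chunk yielded by `llama()` is the stop chunk, so `chunk.data.timings` can be read directly inside the `for await` loop.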
