Expose generation timings from server & update completions.js #2116

Merged Jul 5, 2023 · 4 commits

37 changes: 20 additions & 17 deletions examples/server/README.md
@@ -26,20 +26,17 @@ Command line options:

## Build

Build llama.cpp with server from repository root with either make or CMake.
server is built alongside everything else from the root of the project

- Using `make`:

```bash
LLAMA_BUILD_SERVER=1 make
make
```

- Using `CMake`:

```bash
mkdir build-server
cd build-server
cmake -DLLAMA_BUILD_SERVER=ON ..
cmake --build . --config Release
```

@@ -208,24 +205,30 @@ openai.api_base = "http://<Your api-server IP>:port"

Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API

### Extending the Web Front End
### Extending or building an alternative Web Front End

The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method. A simple example is below:
The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.

```
Read the documentation in `/completion.js` to see convenient ways to access llama.

A simple example is below:

```html
<html>
<body>
<pre>
<script type="module">
import { llamaComplete } from '/completion.js'

llamaComplete({
prompt: "### Instruction:\nWrite dad jokes, each one paragraph. You can use html formatting if needed.\n\n### Response:",
n_predict: 1024,
},
null,
(chunk) => document.write(chunk.data.content)
)
import { llama } from '/completion.js'

const prompt = `### Instruction:
Write dad jokes, each one paragraph.
You can use html formatting if needed.

### Response:`

for await (const chunk of llama(prompt)) {
document.write(chunk.data.content)
}
</script>
</pre>
</body>
548 changes: 365 additions & 183 deletions examples/server/completion.js.hpp

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions examples/server/deps.sh
@@ -4,10 +4,6 @@
# get the directory of this script file
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
PUBLIC=$DIR/public
OUTPUT=$DIR/templats.hpp

echo "// Generated file, do not edit" > $OUTPUT
echo "" > $OUTPUT

echo "download js bundle files"
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
1,579 changes: 816 additions & 763 deletions examples/server/index.html.hpp

Large diffs are not rendered by default.

119 changes: 103 additions & 16 deletions examples/server/public/completion.js
@@ -5,20 +5,29 @@ const paramDefaults = {
stop: ["</s>"]
};

/**
* This function completes the input text using a llama dictionary.
* @param {object} params - The parameters for the completion request.
* @param {object} controller - an instance of AbortController if you need one, or null.
* @param {function} callback - The callback function to call when the completion is done.
* @returns {string} the completed text as a string. Ideally ignored, and you get at it via the callback.
*/
export const llamaComplete = async (params, controller, callback) => {
let generation_settings = null;


// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
// import { llama } from '/completion.js'
//
// const request = llama("Tell me a joke", {n_predict: 800})
// for await (const chunk of request) {
// document.write(chunk.data.content)
// }
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;

if (!controller) {
controller = new AbortController();
}
const completionParams = { ...paramDefaults, ...params };

// we use fetch directly here because the built-in fetchEventSource does not support POST
const completionParams = { ...paramDefaults, ...params, prompt };

const response = await fetch("/completion", {
method: 'POST',
body: JSON.stringify(completionParams),
@@ -36,7 +45,6 @@ export const llamaComplete = async (params, controller, callback) => {
let content = "";

try {

let cont = true;

while (cont) {
@@ -59,18 +67,21 @@ export const llamaComplete = async (params, controller, callback) => {
result.data = JSON.parse(result.data);
content += result.data.content;

// callback
if (callback) {
cont = callback(result) != false;
}
// yield
yield result;

// if we got a stop token from server, we will break here
if (result.data.stop) {
if (result.data.generation_settings) {
generation_settings = result.data.generation_settings;
}
break;
}
}
} catch (e) {
console.error("llama error: ", e);
if (e.name !== 'AbortError') {
console.error("llama error: ", e);
}
throw e;
}
finally {
@@ -79,3 +90,79 @@

return content;
}
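
// The usage sketch below is added for this write-up and is not part of the
// patch. It shows how a caller might supply its own AbortController through
// `config.controller` and pick up the `timings` object the server now attaches
// to the final chunk; the exact fields inside `timings` are whatever the server
// reports, so the sketch only logs the object.
//
//   import { llama } from '/completion.js'
//
//   const controller = new AbortController()
//
//   for await (const chunk of llama("Tell me a joke", { n_predict: 256 }, { controller })) {
//     document.write(chunk.data.content)
//     if (chunk.data.stop && chunk.data.timings) {
//       console.log("generation timings:", chunk.data.timings)
//     }
//   }
//
//   // calling controller.abort() from elsewhere cancels the underlying fetch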

// Call llama, return an event target that you can subscribe to
//
// Example:
//
// import { llamaEventTarget } from '/completion.js'
//
// const conn = llamaEventTarget(prompt)
// conn.addEventListener("message", (chunk) => {
// document.write(chunk.detail.content)
// })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
const eventTarget = new EventTarget();
(async () => {
let content = "";
for await (const chunk of llama(prompt, params, config)) {
if (chunk.data) {
content += chunk.data.content;
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
}
if (chunk.data.generation_settings) {
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
}
if (chunk.data.timings) {
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
}
}
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
})();
return eventTarget;
}
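
// Illustrative sketch, not part of the patch: besides "message", the event
// target above also emits the "generation_settings", "timings" and "done"
// events introduced in this change. The detail payload of each event is
// whatever the server reports, so the sketch only logs it.
//
//   import { llamaEventTarget } from '/completion.js'
//
//   const conn = llamaEventTarget("Tell me a joke")
//   conn.addEventListener("message", (e) => document.write(e.detail.content))
//   conn.addEventListener("timings", (e) => console.log("timings:", e.detail))
//   conn.addEventListener("done", (e) => console.log("full text:", e.detail.content))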

// Call llama, return a promise that resolves to the completed text. This does not support streaming
//
// Example:
//
// llamaPromise(prompt).then((content) => {
// document.write(content)
// })
//
// or
//
// const content = await llamaPromise(prompt)
// document.write(content)
//
export const llamaPromise = (prompt, params = {}, config = {}) => {
return new Promise(async (resolve, reject) => {
let content = "";
try {
for await (const chunk of llama(prompt, params, config)) {
content += chunk.data.content;
}
resolve(content);
} catch (error) {
reject(error);
}
});
};

/**
* (deprecated)
*/
export const llamaComplete = async (params, controller, callback) => {
for await (const chunk of llama(params.prompt, params, { controller })) {
callback(chunk);
}
}
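
// Migration sketch, not part of the patch: code that used the old
// llamaComplete(params, controller, callback) form can switch to the
// generator without changing behaviour, since llamaComplete() above is
// now just a thin wrapper around llama().
//
//   // before
//   llamaComplete({ prompt, n_predict: 1024 }, null, (chunk) => document.write(chunk.data.content))
//
//   // after
//   for await (const chunk of llama(prompt, { n_predict: 1024 })) {
//     document.write(chunk.data.content)
//   }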

// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async () => {
if (!generation_settings) {
generation_settings = await fetch("/model.json").then(r => r.json());
}
return generation_settings;
}
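
// Illustrative sketch, not part of the patch: llamaModelInfo() fetches
// /model.json once and caches the result in `generation_settings`, so
// repeated calls do not hit the server again. The keys of the returned
// object are whatever generation settings the server reports.
//
//   import { llamaModelInfo } from '/completion.js'
//
//   const settings = await llamaModelInfo()
//   console.log("model settings:", settings)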