Skip to content

Commit

Permalink
examples : server chat mode with llama2 (#2400)
Browse files Browse the repository at this point in the history
* add: server chat mode with llama2

* fix: remove the unnecessary last \n
  • Loading branch information
nhamanasu authored Jul 28, 2023
1 parent d91f3f0 commit 34ae1ca
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 0 deletions.
26 changes: 26 additions & 0 deletions examples/server-llama2-13B.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
#
# Launch ./server with a LLaMA 2 13B chat model.
# Every setting below can be overridden via an environment variable of the
# same name; extra server flags may be appended on the command line.

set -e

# Always run from the repository root, regardless of where we were invoked.
cd "$(dirname "$0")/.." || exit

# Model to serve (override with MODEL=...).
MODEL="${MODEL:-./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}"
# NOTE(review): PROMPT_TEMPLATE is defined but not passed to ./server below —
# presumably kept for parity with sibling scripts; confirm before removing.
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat-system.txt}"

# Number of CPU threads to use (override with N_THREAD=...).
N_THREAD="${N_THREAD:-12}"

# Default generation options; override wholesale with GEN_OPTIONS=...
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"


# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./server $GEN_OPTIONS \
  --model "$MODEL" \
  --threads "$N_THREAD" \
  --rope-freq-scale 1.0 \
  "$@"

# I used this to test the model with mps, but omitted it from the general purpose. If you want to use it, just specify it on the command line.
# -ngl 1 \
109 changes: 109 additions & 0 deletions examples/server/chat-llama2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/bin/bash

# Interactive chat client for the llama.cpp HTTP server, using the llama2-chat
# [INST] / <<SYS>> prompt format. Requires: curl, jq, python.
# Server address is overridable via the API_URL environment variable.
API_URL="${API_URL:-http://127.0.0.1:8080}"

# Chat history as a flat array of alternating (user, assistant) turns.
# Seeded with one example exchange; chat_completion appends to it.
CHAT=(
"Hello, Assistant."
"Hello. How may I help you today?"
)

# System prompt placed inside the llama2 <<SYS>> block.
INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

# trim STRING — strip leading and trailing whitespace, print the result
# without a trailing newline.
trim() {
  shopt -s extglob  # enable +(...) patterns in the expansions below
  local s=$1
  s=${s##+([[:space:]])}
  printf '%s' "${s%%+([[:space:]])}"
}

# trim_trailing STRING — strip trailing whitespace only, print the result
# without a trailing newline (leading whitespace is preserved).
trim_trailing() {
  shopt -s extglob  # enable +(...) pattern in the expansion below
  local s=$1
  printf '%s' "${s%%+([[:space:]])}"
}

# format_prompt QUESTION — print the next prompt chunk to send to the server.
# With an empty CHAT history this is the llama2 system block; otherwise it is
# the most recent assistant answer followed by the new [INST] question.
# Globals read: CHAT, INSTRUCTION.
# NOTE(review): the "\n" sequences are emitted as literal backslash-n (no
# escape interpretation), matching the original behavior.
format_prompt() {
  local question=$1
  if (( ${#CHAT[@]} == 0 )); then
    printf '%s' "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
  else
    local last=${CHAT[$(( ${#CHAT[@]} - 1 ))]}
    printf '%s' "${last}\n[INST] ${question} [/INST]"
  fi
}

# tokenize TEXT — POST TEXT to the server's /tokenize endpoint and print one
# token id per line. Globals read: API_URL. Requires: curl, jq.
tokenize() {
  local payload
  payload=$(jq -ns --arg content "$1" '{content:$content}')
  curl --silent \
    --request POST \
    --url "${API_URL}/tokenize" \
    --header "Content-Type: application/json" \
    --data-raw "$payload" |
    jq '.tokens[]'
}

# Number of tokens in the system block; passed to the server as n_keep so the
# system prompt is retained in cache when the context window rolls over.
# NOTE(review): "\n" is sent as a literal backslash-n, not a newline — the
# server tokenizes those two characters verbatim; confirm this is intended.
N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)

# chat_completion QUESTION — stream the model's answer for QUESTION to stdout.
# Builds the prompt from the chat history, POSTs it to ${API_URL}/completion
# as a streaming request, prints content tokens as they arrive, and appends
# the (question, answer) pair to CHAT.
# Globals: API_URL, N_KEEP (read); CHAT (appended); PROMPT/DATA/ANSWER (set).
# Requires: curl, jq, python.
chat_completion() {
  PROMPT="$(trim_trailing "$(format_prompt "$1")")"
  # jq -Rs reads the raw prompt from stdin as a single JSON string.
  DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep "$N_KEEP" '{
    prompt: .,
    temperature: 0.2,
    top_k: 40,
    top_p: 0.9,
    n_keep: $n_keep,
    n_predict: 1024,
    stop: ["[INST]"],
    stream: true
  }')"

  # Temporary file carries the accumulated answer out of the Python subprocess.
  TEMPFILE=$(mktemp)

  # FD 3 streams the server-sent events from curl into Python below.
  exec 3< <(curl \
    --silent \
    --no-buffer \
    --request POST \
    --url "${API_URL}/completion" \
    --header "Content-Type: application/json" \
    --data-raw "${DATA}")

  # Parse the "data: {...}" event lines, echoing each content chunk as it
  # arrives and collecting the full answer for the chat history.
  python -c "
import json
import sys
answer = ''
while True:
    line = sys.stdin.readline()
    if not line:
        break
    if line.startswith('data: '):
        json_content = line[6:].strip()
        content = json.loads(json_content)['content']
        sys.stdout.write(content)
        sys.stdout.flush()
        answer += content
answer = answer.rstrip('\n')
# Write the answer to the temporary file
with open('$TEMPFILE', 'w') as f:
    f.write(answer)
" <&3

  exec 3<&-

  # Fix: quote the temp path (SC2086) so a TMPDIR containing spaces works,
  # and use `rm -f --` so cleanup never aborts the script.
  ANSWER=$(cat "$TEMPFILE")
  rm -f -- "$TEMPFILE"

  printf "\n"

  CHAT+=("$1" "$(trim "$ANSWER")")
}

# Interactive REPL: prompt in green, read one question (with readline editing),
# then stream the model's answer.
while :; do
  printf '\033[0;32m'  # switch to green for the prompt
  read -r -e -p "> " QUESTION
  printf '\033[0m'     # reset terminal color
  chat_completion "${QUESTION}"
done

1 comment on commit 34ae1ca

@jxy
Copy link
Contributor

@jxy jxy commented on 34ae1ca Jul 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please note that this does not exactly conform to the llama code due to missing EOS/BOS, c.f.
https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L249-L252

I don't know what we can do with main, but for server, you may use #2306

Please sign in to comment.