llama_test.py
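# llama_test.py drives an interactive role-play test: the model is given a system
# prompt describing a conversation between "John" and "Jane", the user supplies
# Jane's lines, the model answers as John, and at the end the script checks which
# name the model believes it has been playing by inspecting the probabilities of
# its first answer token.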
from llama_cpp import Llama
import torch

ASSISTANT_MODE = True
NAME_2 = "John"
NAME_1 = "Jane"
SYS_PROMPT = "A conversation between " + NAME_2 + " and " + NAME_1 + ". The user and assistant each take roles as one of these interlocutors. Afterwards, the user tests if the assistant can tell who is who."

# Change the model path to the path of the model you want to use
# Change n_gpu_layers to the number of layers you are able to offload to your GPU
# logits_all=True keeps the logits for every position so log-probabilities can be requested later
llama_model = Llama(model_path="./Meta-Llama-3-8B-Instruct-Q6_K.gguf", n_gpu_layers=33, verbose=False, logits_all=True)

# Vocabulary size of the Llama 3 tokenizer; used to request log-probabilities
# over every token in the final identity test
vocab_size = 128256
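# The transcript below is assembled by hand in the Llama 3 instruct chat format:
# <|begin_of_text|>, then one <|start_header_id|>NAME<|end_header_id|> ... <|eot_id|>
# block per turn, with the speaker names John and Jane used in place of the usual
# "user"/"assistant" roles until the final identity question.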
final_result = ""
if ASSISTANT_MODE:
    final_result += "<|begin_of_text|><|start_header_id|>system<|end_header_id|>" + SYS_PROMPT + "<|eot_id|>\n"
# Four rounds of dialogue: the user types NAME_1's lines, the model replies as NAME_2
for i in range(4):
    s_in = input()
    # Replace all literal "\n" sequences typed by the user with actual newline characters
    s_in = s_in.replace("\\n", "\n")
    print("\n", end='', flush=True)
    final_result = final_result + "<|start_header_id|>" + NAME_1 + "<|end_header_id|>" + s_in + "<|eot_id|>\n" + "<|start_header_id|>" + NAME_2 + "<|end_header_id|>"
    prompt = final_result

    # Define the sampling parameters
    max_tokens = 4096
    temperature = 0.3
    top_p = 0.1
    echo = True
    stop = ["<|eot_id|>"]

    # Stream the model's reply token by token
    for token in llama_model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        stop=stop,
        stream=True,
    ):
        s = token["choices"][0]["text"]
        final_result += s
        print(s, end='', flush=True)
    final_result += "<|eot_id|>\n"
    print("\n", end='', flush=True)
# Test whether the assistant can tell who is who
s_in = "Are you John or are you Jane? Please answer in a single word."
print(s_in, end='', flush=True)
print("\n", end='', flush=True)
final_result = final_result + "<|start_header_id|>user<|end_header_id|>" + s_in + "<|eot_id|>\n" + "<|start_header_id|>assistant<|end_header_id|>"
prompt = final_result

# Stream again, this time requesting log-probabilities over the full vocabulary,
# and stop after the first generated token
cnt = 0
for token in llama_model(
    prompt,
    max_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    echo=echo,
    stop=stop,
    stream=True,
    logprobs=vocab_size
):
    if cnt > 0:
        break
    cnt += 1
    s = token["choices"][0]["text"]
    final_result += s
    print(s, end='', flush=True)
    # Print the actual probabilities of the top 5 candidate tokens
    logprobs = token["choices"][0]["logprobs"]["top_logprobs"][0]
    # Convert the log-probabilities back to probabilities with a softmax
    logprobs_vals = torch.tensor(list(logprobs.values()))
    probs = torch.nn.functional.softmax(logprobs_vals, dim=0)
    probs = probs.tolist()
    # The top_logprobs keys are assumed to be ordered from most to least likely
    top_5_tokens = list(logprobs.keys())
    top_5_probs = probs
    print("\nTop 5 tokens:")
    for i in range(5):
        # Escape whitespace-only tokens so they are visible in the printout
        if top_5_tokens[i] == "\n":
            s_print = "'\\n'"
        elif top_5_tokens[i] == "\t":
            s_print = "'\\t'"
        elif top_5_tokens[i] == "\n\n":
            s_print = "'\\n\\n'"
        else:
            s_print = "'" + top_5_tokens[i] + "'"
        print(s_print, " " * (10 - len(s_print)) + str(top_5_probs[i]))
print("\n\n")
print("A printout of the complete conversation, including all special tokens.")
print(final_result)
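# Rough usage sketch (assumes the Q6_K GGUF file referenced above has been downloaded
# into the working directory):
#   $ python llama_test.py
#   <type Jane's line, press Enter>   -> John's reply streams back
#   ... repeated for four exchanges ...
# After the fourth exchange the script asks "Are you John or are you Jane?" and prints
# the probabilities of the model's top 5 first-token candidates, followed by the full
# raw transcript including all special tokens.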