# frozen_string_literal: true
module Langchain::LLM
# Interface to the Ollama API.
# Available models: https://ollama.ai/library
#
# Usage:
# llm = Langchain::LLM::Ollama.new(url: ENV["OLLAMA_URL"], default_options: {})
#
class Ollama < Base
attr_reader :url, :defaults
DEFAULTS = {
temperature: 0.0,
completion_model: "llama3.2",
embedding_model: "llama3.2",
chat_model: "llama3.2",
options: {}
}.freeze
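# Known embedding vector sizes per model. Models not listed here are
# probed at runtime by #default_dimensions.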
EMBEDDING_SIZES = {
codellama: 4_096,
"dolphin-mixtral": 4_096,
llama2: 4_096,
llama3: 4_096,
"llama3.1": 4_096,
"llama3.2": 4_096,
llava: 4_096,
mistral: 4_096,
"mistral-openorca": 4_096,
mixtral: 4_096,
tinydolphin: 2_048
}.freeze
# Initialize the Ollama client
# @param url [String] The URL of the Ollama instance
# @param api_key [String] The API key to use. Optional; only needed when the Ollama API is exposed through Open WebUI
# @param default_options [Hash] The default options to use
#
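# Example (illustrative; the api_key is only needed behind Open WebUI, and the env var names are placeholders):
#
#   llm = Langchain::LLM::Ollama.new(
#     url: ENV["OLLAMA_URL"] || "http://localhost:11434",
#     api_key: ENV["OLLAMA_API_KEY"],
#     default_options: {chat_model: "llama3.2", temperature: 0.0}
#   )
#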
def initialize(url: "http://localhost:11434", api_key: nil, default_options: {})
depends_on "faraday"
@url = url
@api_key = api_key
@defaults = DEFAULTS.merge(default_options)
chat_parameters.update(
model: {default: @defaults[:chat_model]},
temperature: {default: @defaults[:temperature]},
template: {},
stream: {default: false},
response_format: {default: @defaults[:response_format]},
options: {default: @defaults[:options]}
)
chat_parameters.remap(response_format: :format)
end
# Returns the # of vector dimensions for the embeddings
# @return [Integer] The # of vector dimensions
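# Example (sketch; the value depends on the configured embedding model):
#
#   llm.default_dimensions #=> 4096 for the default "llama3.2" model
#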
def default_dimensions
# Ollama can serve many models, so look the size up by model name or, if unknown, probe by generating an embedding
@default_dimensions ||=
EMBEDDING_SIZES.fetch(defaults[:embedding_model].to_sym) do
embed(text: "test").embedding.size
end
end
#
# Generate the completion for a given prompt
#
# @param prompt [String] The prompt to complete
# @param model [String] The model to use
# For a list of valid parameters and values, see:
# https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values
# @param block [Proc] Receive the intermediate responses as a stream of +OllamaResponse+ objects.
# @return [Langchain::LLM::OllamaResponse] Response object
#
# Example:
#
# final_resp = ollama.complete(prompt:) { |resp| print resp.completion }
# final_resp.total_tokens
#
def complete(
prompt:,
model: defaults[:completion_model],
images: nil,
format: nil,
system: nil,
template: nil,
context: nil,
raw: nil,
mirostat: nil,
mirostat_eta: nil,
mirostat_tau: nil,
num_ctx: nil,
num_gqa: nil,
num_gpu: nil,
num_thread: nil,
repeat_last_n: nil,
repeat_penalty: nil,
temperature: defaults[:temperature],
seed: nil,
stop: nil,
tfs_z: nil,
num_predict: nil,
top_k: nil,
top_p: nil,
stop_sequences: nil,
&block
)
if stop_sequences
stop = stop_sequences
end
parameters = {
prompt: prompt,
model: model,
images: images,
format: format,
system: system,
template: template,
context: context,
stream: block_given?, # rubocop:disable Performance/BlockGivenWithExplicitBlock
raw: raw
}.compact
llm_parameters = {
mirostat: mirostat,
mirostat_eta: mirostat_eta,
mirostat_tau: mirostat_tau,
num_ctx: num_ctx,
num_gqa: num_gqa,
num_gpu: num_gpu,
num_thread: num_thread,
repeat_last_n: repeat_last_n,
repeat_penalty: repeat_penalty,
temperature: temperature,
seed: seed,
stop: stop,
tfs_z: tfs_z,
num_predict: num_predict,
top_k: top_k,
top_p: top_p
}
parameters[:options] = llm_parameters.compact
responses_stream = []
client.post("api/generate", parameters) do |req|
req.options.on_data = json_responses_chunk_handler do |parsed_chunk|
responses_stream << parsed_chunk
block&.call(OllamaResponse.new(parsed_chunk, model: parameters[:model]))
end
end
generate_final_completion_response(responses_stream, parameters[:model])
end
# Generate a chat completion
#
# @param messages [Array] The chat messages
# @param model [String] The model to use
# @param params [Hash] Unified chat parameters from [Langchain::LLM::Parameters::Chat::SCHEMA]
# @option params [Array<Hash>] :messages Array of messages
# @option params [String] :model Model name
# @option params [String] :format Format to return a response in. Currently the only accepted value is `json`
# @option params [Float] :temperature The temperature to use
# @option params [String] :template The prompt template to use (overrides what is defined in the `Modelfile`)
# @param block [Proc] Receive the intermediate responses as a stream of +OllamaResponse+ objects.
# @return [Langchain::LLM::OllamaResponse] Response object
#
# Example:
#
# final_resp = ollama.chat(messages:) { |resp| print resp.chat_completion }
# final_resp.total_tokens
#
# The message object has the following fields:
# role: the role of the message, either system, user or assistant
# content: the content of the message
# images (optional): a list of images to include in the message (for multimodal models such as llava)
def chat(messages:, model: nil, **params, &block)
parameters = chat_parameters.to_params(params.merge(messages:, model:, stream: block_given?)) # rubocop:disable Performance/BlockGivenWithExplicitBlock
responses_stream = []
client.post("api/chat", parameters) do |req|
req.options.on_data = json_responses_chunk_handler do |parsed_chunk|
responses_stream << parsed_chunk
block&.call(OllamaResponse.new(parsed_chunk, model: parameters[:model]))
end
end
generate_final_chat_completion_response(responses_stream, parameters[:model])
end
#
# Generate an embedding for a given text
#
# @param text [String] The text to generate an embedding for
# @param model [String] The model to use
# @param options [Hash] The options to use
# @return [Langchain::LLM::OllamaResponse] Response object
#
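# Example (sketch):
#
#   response = llm.embed(text: "Hello world")
#   response.embedding #=> array of floats sized per the model (e.g. 4096 for llama3.2)
#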
def embed(
text:,
model: defaults[:embedding_model],
mirostat: nil,
mirostat_eta: nil,
mirostat_tau: nil,
num_ctx: nil,
num_gqa: nil,
num_gpu: nil,
num_thread: nil,
repeat_last_n: nil,
repeat_penalty: nil,
temperature: defaults[:temperature],
seed: nil,
stop: nil,
tfs_z: nil,
num_predict: nil,
top_k: nil,
top_p: nil
)
parameters = {
model: model,
input: Array(text)
}.compact
llm_parameters = {
mirostat: mirostat,
mirostat_eta: mirostat_eta,
mirostat_tau: mirostat_tau,
num_ctx: num_ctx,
num_gqa: num_gqa,
num_gpu: num_gpu,
num_thread: num_thread,
repeat_last_n: repeat_last_n,
repeat_penalty: repeat_penalty,
temperature: temperature,
seed: seed,
stop: stop,
tfs_z: tfs_z,
num_predict: num_predict,
top_k: top_k,
top_p: top_p
}
parameters[:options] = llm_parameters.compact
response = client.post("api/embed") do |req|
req.body = parameters
end
OllamaResponse.new(response.body, model: parameters[:model])
end
# Generate a summary for a given text
#
# @param text [String] The text to generate a summary for
# @return [Langchain::LLM::OllamaResponse] Response object containing the summary
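# Example (sketch; delegates to #complete with the bundled summarize prompt template):
#
#   summary = llm.summarize(text: long_article_text)
#   summary.completion #=> the generated summary text
#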
def summarize(text:)
prompt_template = Langchain::Prompt.load_from_path(
file_path: Langchain.root.join("langchain/llm/prompts/ollama/summarize_template.yaml")
)
prompt = prompt_template.format(text: text)
complete(prompt: prompt)
end
private
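# Memoized Faraday connection with JSON request/response middleware,
# error raising, and request/response logging.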
def client
@client ||= Faraday.new(url: url, headers: auth_headers) do |conn|
conn.request :json
conn.response :json
conn.response :raise_error
conn.response :logger, Langchain.logger, {headers: true, bodies: true, errors: true}
end
end
def auth_headers
return unless @api_key
{"Authorization" => "Bearer #{@api_key}"}
end
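# Ollama streams newline-delimited JSON; each line of a chunk is parsed
# into a Hash and yielded to the given block.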
def json_responses_chunk_handler(&block)
proc do |chunk, _size|
chunk.split("\n").each do |chunk_line|
parsed_chunk = JSON.parse(chunk_line)
block.call(parsed_chunk)
end
end
end
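# Collapses streamed chunks into a single response by concatenating the
# partial "response" strings onto the metadata of the final chunk.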
def generate_final_completion_response(responses_stream, model)
final_response = responses_stream.last.merge(
"response" => responses_stream.map { |resp| resp["response"] }.join
)
OllamaResponse.new(final_response, model: model)
end
# BUG: If streamed, this method does not currently return the tool_calls response.
def generate_final_chat_completion_response(responses_stream, model)
final_response = responses_stream.last
final_response["message"]["content"] = responses_stream.map { |resp| resp.dig("message", "content") }.join
OllamaResponse.new(final_response, model: model)
end
end
end