@@ -95,8 +95,7 @@ class CompletionRequest:
     """

     model: str
-    prompt: str
-    messages: Optional[List[_AbstractMessage]]
+    messages: List[_AbstractMessage]
     frequency_penalty: float = 0.0
     temperature: float = 0.0
     stop: Optional[List[str]] = None
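The request schema drops the standalone prompt field and makes messages required. A minimal sketch of what a request now looks like, assuming CompletionRequest is a @dataclass like the sibling types in this file and that each _AbstractMessage is a role/content mapping, as implied by the messages[-1].get("content") call later in this diff:

    # Sketch only: the model name and message dict shape are illustrative.
    request = CompletionRequest(
        model="llama3",
        messages=[{"role": "user", "content": "Hello!"}],
        temperature=0.0,
    )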
@@ -121,10 +120,10 @@ class CompletionChoice:
     See the "The chat completion object >>> choices" section of the OpenAI API docs for more details.
     """

-    finish_reason: str
     index: int
     message: AssistantMessage
-    logprobs: Optional[List[Any]]
+    finish_reason: str = None
+    logprobs: Optional[List[Any]] = None


 @dataclass
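The reordering here is forced by Python's dataclass rules, not just style: once finish_reason and logprobs gain defaults, they must come after all required fields, because a dataclass raises a TypeError at class-definition time when a non-default field follows one with a default. A minimal illustration (the class name is generic, not from the PR):

    from dataclasses import dataclass
    from typing import Any, List, Optional

    @dataclass
    class Example:
        index: int                            # required fields first
        finish_reason: Optional[str] = None   # defaulted fields must follow
        logprobs: Optional[List[Any]] = None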
@@ -151,7 +150,7 @@ class CompletionResponse:
     created: int
     model: str
     system_fingerprint: str
-    usage: UsageStats
+    usage: Optional[UsageStats] = None
     object: str = "chat.completion"
     service_tier: Optional[str] = None

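Making usage optional lets a response be constructed before any token accounting exists; the new sync_completion below relies on this by omitting the field entirely. A sketch of the effect (all field values are illustrative, not from the PR):

    response = CompletionResponse(
        id="chatcmpl-0",                 # illustrative values only
        choices=[],
        created=1700000000,
        model="llama3",
        system_fingerprint="cudadtype",
    )
    assert response.usage is None
    assert response.object == "chat.completion"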
@@ -220,8 +219,13 @@ def __init__(self, *args, **kwargs):
             if self.draft_model is not None
             else self.model.config.max_seq_length
         )
+        # The System fingerprint is a unique identifier for the model and its configuration.
+        # Currently, this is not implemented in a
+        self.system_fingerprint = (
+            self.builder_args.device + type(self.builder_args.precision).__name__
+        )

-    def completion(self, completion_request: CompletionRequest):
+    def chunked_completion(self, completion_request: CompletionRequest):
         """Handle a chat completion request and yield a chunked response.

         **Warning**: Not all arguments of the CompletionRequest are consumed as the server isn't completely implemented.
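The interim fingerprint is just the device string concatenated with the type name of the precision setting. Assuming builder_args.device is a string like "cuda" and builder_args.precision is a torch.dtype (an assumption; the diff does not show these types), it evaluates like this:

    import torch

    # Assumed stand-ins for builder_args.device / builder_args.precision:
    device = "cuda"
    precision = torch.bfloat16

    # type(torch.bfloat16).__name__ is "dtype", so the precision value itself
    # is not captured: every dtype produces the same suffix.
    fingerprint = device + type(precision).__name__
    print(fingerprint)  # -> "cudadtype"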
@@ -230,7 +234,8 @@ def completion(self, completion_request: CompletionRequest):
         - messages: The server consumes the final element of the array as the prompt.
         - model: This has no impact on the server state, i.e. changing the model in the request
             will not change which model is responding. Instead, use the --model flag to select the model when starting the server.
-        - temperature: This is used to control the randomness of the response. The server will use the temperature
+        - temperature: This is used to control the randomness of the response.
+        - system_fingerprint: A unique identifier for the model and its configuration. Currently unimplemented - subject to change.

         See https://github.com/pytorch/torchchat/issues/973 for more details.

@@ -246,13 +251,16 @@ def completion(self, completion_request: CompletionRequest):

         # Initialize counters for chunk responses and encode the prompt.
         id = str(uuid.uuid4())
+
         idx = 0
         buffer = []
         encoded = self.encode_tokens(
-            completion_request.prompt, bos=True, device=self.builder_args.device
+            completion_request.messages[-1].get("content"),
+            bos=True,
+            device=self.builder_args.device,
         )
         generator_args = GeneratorArgs(
-            completion_request.prompt,
+            completion_request.messages[-1].get("content"),
             encoded_prompt=encoded,
             chat_mode=False,
         )
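With prompt gone, the server derives the prompt from the final entry of messages and ignores earlier turns, matching the docstring's warning above. A sketch of that extraction, assuming OpenAI-style role/content dicts:

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is 2 + 2?"},
    ]
    # Only the last message is consumed; prior context is currently dropped.
    prompt = messages[-1].get("content")
    print(prompt)  # -> "What is 2 + 2?"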
@@ -302,21 +310,45 @@ def callback(x, *, done_generating=False):
                 choices=[choice_chunk],
                 created=int(time.time()),
                 model=completion_request.model,
-                system_fingerprint=uuid.UUID(int=uuid.getnode()),
+                system_fingerprint=self.system_fingerprint,
             )
             yield chunk_response
             self.start_pos += y.size(0)
             idx += 1

         # Yield an ending chunk indicating the generation has completed.
-        end_chunk = CompletionChoiceChunk(ChunkDelta(None, None, None), idx, "eos")
+        end_chunk = CompletionChoiceChunk(
+            ChunkDelta(None, None, None), idx, finish_reason="stop"
+        )

         yield CompletionResponseChunk(
             id=str(id),
             choices=[end_chunk],
             created=int(time.time()),
             model=completion_request.model,
-            system_fingerprint=uuid.UUID(int=uuid.getnode()),
+            system_fingerprint=self.system_fingerprint,
+        )
+
+    def sync_completion(self, request: CompletionRequest):
+        """Handle a chat completion request and yield a single, non-chunked response"""
+        output = ""
+        for chunk in self.chunked_completion(request):
+            if not chunk.choices[0].finish_reason:
+                output += chunk.choices[0].delta.content
+
+        message = AssistantMessage(content=output)
+        return CompletionResponse(
+            id=str(uuid.uuid4()),
+            choices=[
+                CompletionChoice(
+                    finish_reason="stop",
+                    index=0,
+                    message=message,
+                )
+            ],
+            created=int(time.time()),
+            model=request.model,
+            system_fingerprint=self.system_fingerprint,
         )

     def _callback(self, x, *, buffer, done_generating):
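Together, chunked_completion and sync_completion give callers a streaming and a blocking entry point over the same generator. A usage sketch, assuming api_generator is an instance of the class modified above (the variable name and request values are illustrative, and a real handler would use one of the two styles per request):

    request = CompletionRequest(
        model="llama3",
        messages=[{"role": "user", "content": "Tell me a joke."}],
    )

    # Streaming: print each delta as it is generated.
    for chunk in api_generator.chunked_completion(request):
        if not chunk.choices[0].finish_reason:
            print(chunk.choices[0].delta.content, end="", flush=True)

    # Blocking: accumulate internally and return one CompletionResponse.
    response = api_generator.sync_completion(request)
    print(response.choices[0].message.content)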