|
156 | 156 | "get": {
|
157 | 157 | "operationId": "HealthController_check",
|
158 | 158 | "summary": "Check health",
|
159 |
| - "description": "Checks the health of the application.", |
| 159 | + "description": "Performs a comprehensive check of the application's health status.", |
160 | 160 | "parameters": [],
|
161 | 161 | "responses": {
|
162 | 162 | "200": {
|
|
175 | 175 | "delete": {
|
176 | 176 | "operationId": "Terminate server process",
|
177 | 177 | "summary": "Terminate server",
|
178 |
| - "description": "Terminate server process.", |
| 178 | + "description": "Initiates the shutdown process for the server, ensuring that all active connections are gracefully closed and any ongoing processes are properly terminated.", |
179 | 179 | "parameters": [],
|
180 | 180 | "responses": {
|
181 | 181 | "200": {
|
|
329 | 329 | }
|
330 | 330 | },
|
331 | 331 | "tags": [
|
332 |
| - "Models" |
| 332 | + "Pulling Models" |
333 | 333 | ]
|
334 | 334 | },
|
335 | 335 | "delete": {
|
336 | 336 | "tags": [
|
337 |
| - "Models" |
| 337 | + "Pulling Models" |
338 | 338 | ],
|
339 | 339 | "summary": "Stop model download",
|
340 | 340 | "description": "Stops the download of a model with the corresponding taskId provided in the request body",
|
|
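For the stop-download request above, only the taskId field is called out in the description; the concrete value and the path this DELETE operation is attached to are not visible in this hunk, so the body below is an illustrative sketch with a placeholder taskId.

{
  "taskId": "<taskId returned when the pull was started>"
}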
449 | 449 | }
|
450 | 450 | },
|
451 | 451 | "tags": [
|
452 |
| - "Models" |
| 452 | + "Running Models" |
453 | 453 | ]
|
454 | 454 | }
|
455 | 455 | },
|
456 | 456 | "/v1/models/start": {
|
457 | 457 | "post": {
|
458 | 458 | "operationId": "ModelsController_startModel",
|
459 | 459 | "summary": "Start model",
|
460 |
| - "description": "Load a model into memory.", |
| 460 | + "description": "Load a model into memory. Note: Request body parameters will override those loaded from model.yml", |
461 | 461 | "requestBody": {
|
462 | 462 | "required": true,
|
463 | 463 | "content": {
|
|
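The reworded description above notes that request body parameters override the values loaded from model.yml. A minimal sketch of such a body, reusing the example values from the ModelStartDto fields added later in this diff (only model is required; ctx_len and ngl are optional overrides):

{
  "model": "llama3:8b-gguf-q6-k",
  "ctx_len": 4096,
  "ngl": 32
}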
484 | 484 | }
|
485 | 485 | },
|
486 | 486 | "tags": [
|
487 |
| - "Models" |
| 487 | + "Running Models" |
488 | 488 | ]
|
489 | 489 | }
|
490 | 490 | },
|
|
498 | 498 | "content": {
|
499 | 499 | "application/json": {
|
500 | 500 | "schema": {
|
501 |
| - "$ref": "#/components/schemas/ModelStartDto" |
| 501 | + "$ref": "#/components/schemas/ModelStopDto" |
502 | 502 | },
|
503 | 503 | "example": {
|
504 | 504 | "model": "llama3:8b-gguf-q6-k"
|
|
519 | 519 | }
|
520 | 520 | },
|
521 | 521 | "tags": [
|
522 |
| - "Models" |
| 522 | + "Running Models" |
523 | 523 | ]
|
524 | 524 | }
|
525 | 525 | },
|
|
552 | 552 | }
|
553 | 553 | },
|
554 | 554 | "tags": [
|
555 |
| - "Models" |
| 555 | + "Running Models" |
556 | 556 | ]
|
557 | 557 | },
|
558 | 558 | "delete": {
|
|
583 | 583 | }
|
584 | 584 | },
|
585 | 585 | "tags": [
|
586 |
| - "Models" |
| 586 | + "Running Models" |
587 | 587 | ]
|
588 | 588 | }
|
589 | 589 | },
|
|
625 | 625 | }
|
626 | 626 | },
|
627 | 627 | "tags": [
|
628 |
| - "Models" |
| 628 | + "Running Models" |
629 | 629 | ]
|
630 | 630 | }
|
631 | 631 | },
|
|
668 | 668 | }
|
669 | 669 | },
|
670 | 670 | "tags": [
|
671 |
| - "Models" |
| 671 | + "Pulling Models" |
672 | 672 | ]
|
673 | 673 | }
|
674 | 674 | },
|
|
1368 | 1368 | "description": "These endpoints manage the lifecycle of an Assistant within a conversation thread."
|
1369 | 1369 | },
|
1370 | 1370 | {
|
1371 |
| - "name": "Models", |
1372 |
| - "description": "These endpoints provide a list and descriptions of all available models within the Cortex framework." |
| 1371 | + "name": "Pulling Models", |
| 1372 | + "description": "These endpoints handle downloading and importing models." |
| 1373 | + }, |
| 1374 | + { |
| 1375 | + "name": "Running Models", |
| 1376 | + "description": "These endpoints support a range of operations that allow users to effectively control and interact with their models" |
1373 | 1377 | },
|
1374 | 1378 | {
|
1375 | 1379 | "name": "Server",
|
|
1399 | 1403 | "Chat",
|
1400 | 1404 | "Engines",
|
1401 | 1405 | "Events",
|
1402 |
| - "Models", |
| 1406 | + "Pulling Models", |
| 1407 | + "Running Models", |
1403 | 1408 | "Processes",
|
1404 | 1409 | "Status",
|
1405 | 1410 | "Server"
|
|
3235 | 3240 | "type": "string",
|
3236 | 3241 | "example": "llama3:8b-gguf-q6-k",
|
3237 | 3242 | "description": "A downloaded model name."
|
| 3243 | + }, |
| 3244 | + "ctx_len": { |
| 3245 | + "type": "number", |
| 3246 | + "description": "The context length for model operations varies; the maximum depends on the specific model used.", |
| 3247 | + "example": 4096 |
| 3248 | + }, |
| 3249 | + "ngl": { |
| 3250 | + "type": "number", |
| 3251 | + "description": "Determines GPU layer usage.", |
| 3252 | + "example": 32 |
| 3253 | + }, |
| 3254 | + "n_parallel": { |
| 3255 | + "type": "number", |
| 3256 | + "minimum": 1, |
| 3257 | + "description": "Number of parallel processing units to use.", |
| 3258 | + "example": 1 |
| 3259 | + }, |
| 3260 | + "cache_type": { |
| 3261 | + "type": "string", |
| 3262 | + "description": "KV cache type: f16, q8_0, q4_0, default is f16", |
| 3263 | + "example": "f16" |
| 3264 | + }, |
| 3265 | + "caching_enabled": { |
| 3266 | + "type": "boolean", |
| 3267 | + "description": "To enable prompt caching or not", |
| 3268 | + "example": true |
| 3269 | + }, |
| 3270 | + "model_path": { |
| 3271 | + "type": "string", |
| 3272 | + "description": "Local path to LLM model file", |
| 3273 | + "example": "/tmp/model.gguf" |
| 3274 | + }, |
| 3275 | + "mmproj": { |
| 3276 | + "type": "string", |
| 3277 | + "description": "Local path to mmproj model file", |
| 3278 | + "example": "/tmp/model.gguf" |
3238 | 3279 | }
|
3239 |
| - } |
| 3280 | + }, |
| 3281 | + "required": [ |
| 3282 | + "model" |
| 3283 | + ] |
| 3284 | + }, |
| 3285 | + "ModelStopDto": { |
| 3286 | + "type": "object", |
| 3287 | + "properties": { |
| 3288 | + "model": { |
| 3289 | + "type": "string", |
| 3290 | + "example": "llama3:8b-gguf-q6-k", |
| 3291 | + "description": "A downloaded model name." |
| 3292 | + } |
| 3293 | + }, |
| 3294 | + "required": [ |
| 3295 | + "model" |
| 3296 | + ] |
3240 | 3297 | },
|
3241 | 3298 | "ImportModelRequest": {
|
3242 | 3299 | "type": "object",
|
|
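Taken together, the new ModelStartDto fields allow a start request like the following sketch, which simply combines the example values listed above (every field except model is optional):

{
  "model": "llama3:8b-gguf-q6-k",
  "ctx_len": 4096,
  "ngl": 32,
  "n_parallel": 1,
  "cache_type": "f16",
  "caching_enabled": true,
  "model_path": "/tmp/model.gguf",
  "mmproj": "/tmp/model.gguf"
}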
3256 | 3313 | "option": {
|
3257 | 3314 | "type": "string",
|
3258 | 3315 | "description": "Import options such as symlink or copy.",
|
3259 |
| - "enum": ["symlink", "copy"] |
| 3316 | + "enum": [ |
| 3317 | + "symlink", |
| 3318 | + "copy" |
| 3319 | + ] |
3260 | 3320 | }
|
3261 | 3321 | },
|
3262 | 3322 | "required": [
|
|
3551 | 3611 | },
|
3552 | 3612 | "UpdateModelDto": {
|
3553 | 3613 | "type": "object",
|
3554 |
| - "properties": {} |
| 3614 | + "properties": { |
| 3615 | + "files": { |
| 3616 | + "type": "array", |
| 3617 | + "description": "List of file paths associated with the model. Can be relative or absolute.", |
| 3618 | + "items": { |
| 3619 | + "type": "string", |
| 3620 | + "example": "models\\cortex.so\\tinyllama\\1b-gguf\\model.gguf" |
| 3621 | + } |
| 3622 | + }, |
| 3623 | + "stop": { |
| 3624 | + "type": "array", |
| 3625 | + "description": "Tokens that signal the end of generation.", |
| 3626 | + "items": { |
| 3627 | + "type": "string" |
| 3628 | + }, |
| 3629 | + "example": [ |
| 3630 | + "</s>" |
| 3631 | + ] |
| 3632 | + }, |
| 3633 | + "stream": { |
| 3634 | + "type": "boolean", |
| 3635 | + "description": "Whether to stream the output as it is generated.", |
| 3636 | + "example": true |
| 3637 | + }, |
| 3638 | + "top_p": { |
| 3639 | + "type": "number", |
| 3640 | + "description": "Controls nucleus sampling; the model considers the results of the tokens with top_p probability mass.", |
| 3641 | + "example": 0.95 |
| 3642 | + }, |
| 3643 | + "temperature": { |
| 3644 | + "type": "number", |
| 3645 | + "description": "Controls randomness in token selection; lower values make the output more deterministic.", |
| 3646 | + "example": 0.7 |
| 3647 | + }, |
| 3648 | + "frequency_penalty": { |
| 3649 | + "type": "number", |
| 3650 | + "description": "Penalizes repeated tokens based on their frequency.", |
| 3651 | + "example": 0 |
| 3652 | + }, |
| 3653 | + "presence_penalty": { |
| 3654 | + "type": "number", |
| 3655 | + "description": "Penalizes tokens that have already appeared in the output.", |
| 3656 | + "example": 0 |
| 3657 | + }, |
| 3658 | + "max_tokens": { |
| 3659 | + "type": "integer", |
| 3660 | + "description": "Maximum number of tokens to generate.", |
| 3661 | + "example": 4096 |
| 3662 | + }, |
| 3663 | + "seed": { |
| 3664 | + "type": "integer", |
| 3665 | + "description": "Seed for random number generation to ensure reproducibility; -1 for random seed.", |
| 3666 | + "example": -1 |
| 3667 | + }, |
| 3668 | + "dynatemp_range": { |
| 3669 | + "type": "number", |
| 3670 | + "description": "Range for dynamic temperature adjustment.", |
| 3671 | + "example": 0 |
| 3672 | + }, |
| 3673 | + "dynatemp_exponent": { |
| 3674 | + "type": "number", |
| 3675 | + "description": "Exponent for dynamic temperature adjustment.", |
| 3676 | + "example": 1 |
| 3677 | + }, |
| 3678 | + "top_k": { |
| 3679 | + "type": "integer", |
| 3680 | + "description": "Limits the sampling pool to the top_k most probable tokens.", |
| 3681 | + "example": 40 |
| 3682 | + }, |
| 3683 | + "min_p": { |
| 3684 | + "type": "number", |
| 3685 | + "description": "Minimum probability threshold for token selection.", |
| 3686 | + "example": 0.05 |
| 3687 | + }, |
| 3688 | + "tfs_z": { |
| 3689 | + "type": "number", |
| 3690 | + "description": "Threshold for token frequency sampling.", |
| 3691 | + "example": 1 |
| 3692 | + }, |
| 3693 | + "typ_p": { |
| 3694 | + "type": "number", |
| 3695 | + "description": "Controls typical sampling; similar to top_p but focuses on local token distribution.", |
| 3696 | + "example": 1 |
| 3697 | + }, |
| 3698 | + "repeat_last_n": { |
| 3699 | + "type": "integer", |
| 3700 | + "description": "Number of recent tokens to consider for repetition penalty.", |
| 3701 | + "example": 64 |
| 3702 | + }, |
| 3703 | + "repeat_penalty": { |
| 3704 | + "type": "number", |
| 3705 | + "description": "Penalty applied to repeated tokens.", |
| 3706 | + "example": 1 |
| 3707 | + }, |
| 3708 | + "mirostat": { |
| 3709 | + "type": "boolean", |
| 3710 | + "description": "Enables or disables Mirostat sampling.", |
| 3711 | + "example": false |
| 3712 | + }, |
| 3713 | + "mirostat_tau": { |
| 3714 | + "type": "number", |
| 3715 | + "description": "Target entropy for Mirostat sampling.", |
| 3716 | + "example": 5 |
| 3717 | + }, |
| 3718 | + "mirostat_eta": { |
| 3719 | + "type": "number", |
| 3720 | + "description": "Learning rate for Mirostat sampling.", |
| 3721 | + "example": 0.1 |
| 3722 | + }, |
| 3723 | + "penalize_nl": { |
| 3724 | + "type": "boolean", |
| 3725 | + "description": "Whether to penalize newline tokens.", |
| 3726 | + "example": false |
| 3727 | + }, |
| 3728 | + "ignore_eos": { |
| 3729 | + "type": "boolean", |
| 3730 | + "description": "Whether to ignore end-of-sequence tokens during generation.", |
| 3731 | + "example": false |
| 3732 | + }, |
| 3733 | + "n_probs": { |
| 3734 | + "type": "integer", |
| 3735 | + "description": "Number of probabilities to consider for each token.", |
| 3736 | + "example": 0 |
| 3737 | + }, |
| 3738 | + "min_keep": { |
| 3739 | + "type": "integer", |
| 3740 | + "description": "Minimum number of tokens to keep in the buffer.", |
| 3741 | + "example": 0 |
| 3742 | + }, |
| 3743 | + "engine": { |
| 3744 | + "type": "string", |
| 3745 | + "description": "The engine used to run the model.", |
| 3746 | + "example": "llama-cpp" |
| 3747 | + }, |
| 3748 | + "prompt_template": { |
| 3749 | + "type": "string", |
| 3750 | + "description": "Template used for formatting prompts.", |
| 3751 | + "example": "\n\n<|system|>\n{system_message}</s>\n\n\n\n\n<|user|>\n{prompt}</s>\n\n\n<|assistant|>\n\n" |
| 3752 | + }, |
| 3753 | + "ctx_len": { |
| 3754 | + "type": "integer", |
| 3755 | + "description": "Context length for the model.", |
| 3756 | + "example": 4096 |
| 3757 | + }, |
| 3758 | + "n_parallel": { |
| 3759 | + "type": "integer", |
| 3760 | + "description": "Number of parallel threads for execution.", |
| 3761 | + "example": 1 |
| 3762 | + }, |
| 3763 | + "ngl": { |
| 3764 | + "type": "integer", |
| 3765 | + "description": "Number of GPU layers.", |
| 3766 | + "example": 33 |
| 3767 | + } |
| 3768 | + } |
3555 | 3769 | },
|
3556 | 3770 | "DeleteModelResponseDto": {
|
3557 | 3771 | "type": "object",
|
|
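As a usage sketch for the expanded UpdateModelDto, an update request body combining a few of the example values documented above could look like this (the endpoint that consumes this DTO is outside this diff, and all fields remain optional):

{
  "temperature": 0.7,
  "top_p": 0.95,
  "max_tokens": 4096,
  "stop": ["</s>"],
  "engine": "llama-cpp"
}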