diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json
index b3aacd349..81bbbf7d1 100644
--- a/docs/static/openapi/cortex.json
+++ b/docs/static/openapi/cortex.json
@@ -156,7 +156,7 @@
       "get": {
         "operationId": "HealthController_check",
         "summary": "Check health",
-        "description": "Checks the health of the application.",
+        "description": "Performs a comprehensive check of the application's health status.",
         "parameters": [],
         "responses": {
           "200": {
@@ -175,7 +175,7 @@
       "delete": {
         "operationId": "Terminate server process",
         "summary": "Terminate server",
-        "description": "Terminate server process.",
+        "description": "Initiates the shutdown process for the server, ensuring that all active connections are gracefully closed and any ongoing processes are properly terminated.",
         "parameters": [],
         "responses": {
           "200": {
@@ -329,12 +329,12 @@
           }
         },
         "tags": [
-          "Models"
+          "Pulling Models"
         ]
       },
       "delete": {
         "tags": [
-          "Models"
+          "Pulling Models"
         ],
         "summary": "Stop model download",
         "description": "Stops the download of a model with the corresponding taskId provided in the request body",
@@ -449,7 +449,7 @@
           }
         },
         "tags": [
-          "Models"
+          "Running Models"
         ]
       }
     },
@@ -457,7 +457,7 @@
       "post": {
         "operationId": "ModelsController_startModel",
         "summary": "Start model",
-        "description": "Load a model into memory.",
+        "description": "Load a model into memory. Note: Request body parameters will override those loaded from model.yml.",
         "requestBody": {
           "required": true,
           "content": {
@@ -484,7 +484,7 @@
           }
         },
         "tags": [
-          "Models"
+          "Running Models"
         ]
       }
     },
@@ -498,7 +498,7 @@
         "content": {
           "application/json": {
             "schema": {
-              "$ref": "#/components/schemas/ModelStartDto"
+              "$ref": "#/components/schemas/ModelStopDto"
             },
             "example": {
               "model": "llama3:8b-gguf-q6-k"
             }
@@ -519,7 +519,7 @@
           }
         },
         "tags": [
-          "Models"
+          "Running Models"
        ]
      }
    },
@@ -552,7 +552,7 @@
           }
         },
         "tags": [
-          "Models"
+          "Running Models"
         ]
       },
       "delete": {
@@ -583,7 +583,7 @@
           }
         },
         "tags": [
-          "Models"
+          "Running Models"
         ]
       }
     },
@@ -625,7 +625,7 @@
           }
         },
         "tags": [
-          "Models"
+          "Running Models"
         ]
       }
     },
@@ -668,7 +668,7 @@
           }
         },
         "tags": [
-          "Models"
+          "Pulling Models"
         ]
       }
     },
@@ -1368,8 +1368,12 @@
       "description": "These endpoints manage the lifecycle of an Assistant within a conversation thread."
     },
     {
-      "name": "Models",
-      "description": "These endpoints provide a list and descriptions of all available models within the Cortex framework."
+      "name": "Pulling Models",
+      "description": "These endpoints handle downloading and importing models."
+    },
+    {
+      "name": "Running Models",
+      "description": "These endpoints support operations for controlling and interacting with running models."
     },
     {
       "name": "Server",
@@ -1399,7 +1403,8 @@
         "Chat",
         "Engines",
         "Events",
-        "Models",
+        "Pulling Models",
+        "Running Models",
         "Processes",
         "Status",
         "Server"
       ]
@@ -3235,8 +3240,60 @@
           "type": "string",
           "example": "llama3:8b-gguf-q6-k",
           "description": "A downloaded model name."
+        },
+        "ctx_len": {
+          "type": "number",
+          "description": "Context length for model operations; the maximum value depends on the specific model used.",
+          "example": 4096
+        },
+        "ngl": {
+          "type": "number",
+          "description": "Number of model layers to offload to the GPU.",
+          "example": 32
+        },
+        "n_parallel": {
+          "type": "number",
+          "minimum": 1,
+          "description": "Number of parallel operations to run.",
+          "example": 1
+        },
+        "cache_type": {
+          "type": "string",
+          "description": "KV cache type: f16, q8_0, or q4_0. Defaults to f16.",
+          "example": "f16"
+        },
+        "caching_enabled": {
+          "type": "boolean",
+          "description": "Whether to enable prompt caching.",
+          "example": true
+        },
+        "model_path": {
+          "type": "string",
+          "description": "Local path to the LLM model file.",
+          "example": "/tmp/model.gguf"
+        },
+        "mmproj": {
+          "type": "string",
+          "description": "Local path to the multimodal projector (mmproj) model file.",
+          "example": "/tmp/mmproj.gguf"
         }
-      }
+      },
+      "required": [
+        "model"
+      ]
+    },
+    "ModelStopDto": {
+      "type": "object",
+      "properties": {
+        "model": {
+          "type": "string",
+          "example": "llama3:8b-gguf-q6-k",
+          "description": "A downloaded model name."
+        }
+      },
+      "required": [
+        "model"
+      ]
     },
     "ImportModelRequest": {
       "type": "object",
@@ -3256,7 +3313,10 @@
         "option": {
           "type": "string",
           "description": "Import options such as symlink or copy.",
-          "enum": ["symlink", "copy"]
+          "enum": [
+            "symlink",
+            "copy"
+          ]
         }
       },
       "required": [
@@ -3551,7 +3611,161 @@
     },
     "UpdateModelDto": {
       "type": "object",
-      "properties": {}
+      "properties": {
+        "files": {
+          "type": "array",
+          "description": "List of file paths associated with the model. Can be relative or absolute.",
+          "items": {
+            "type": "string",
+            "example": "models\\cortex.so\\tinyllama\\1b-gguf\\model.gguf"
+          }
+        },
+        "stop": {
+          "type": "array",
+          "description": "Tokens that signal the end of generation.",
+          "items": {
+            "type": "string"
+          },
+          "example": [
+            "</s>"
+          ]
+        },
+        "stream": {
+          "type": "boolean",
+          "description": "Whether to stream the output as it is generated.",
+          "example": true
+        },
+        "top_p": {
+          "type": "number",
+          "description": "Controls nucleus sampling; the model considers the results of the tokens with top_p probability mass.",
+          "example": 0.95
+        },
+        "temperature": {
+          "type": "number",
+          "description": "Controls randomness in token selection; lower values make the output more deterministic.",
+          "example": 0.7
+        },
+        "frequency_penalty": {
+          "type": "number",
+          "description": "Penalizes repeated tokens based on their frequency.",
+          "example": 0
+        },
+        "presence_penalty": {
+          "type": "number",
+          "description": "Penalizes tokens that have already appeared in the output.",
+          "example": 0
+        },
+        "max_tokens": {
+          "type": "integer",
+          "description": "Maximum number of tokens to generate.",
+          "example": 4096
+        },
+        "seed": {
+          "type": "integer",
+          "description": "Seed for random number generation to ensure reproducibility; -1 for a random seed.",
+          "example": -1
+        },
+        "dynatemp_range": {
+          "type": "number",
+          "description": "Range for dynamic temperature adjustment.",
+          "example": 0
+        },
+        "dynatemp_exponent": {
+          "type": "number",
+          "description": "Exponent for dynamic temperature adjustment.",
+          "example": 1
+        },
+        "top_k": {
+          "type": "integer",
+          "description": "Limits the sampling pool to the top_k most probable tokens.",
+          "example": 40
+        },
+        "min_p": {
+          "type": "number",
+          "description": "Minimum probability threshold for token selection.",
+          "example": 0.05
+        },
+        "tfs_z": {
+          "type": "number",
+          "description": "Parameter for tail-free sampling; 1.0 disables it.",
+          "example": 1
+        },
+        "typ_p": {
+          "type": "number",
+          "description": "Controls typical sampling; similar to top_p but focuses on local token distribution.",
+          "example": 1
+        },
+        "repeat_last_n": {
+          "type": "integer",
+          "description": "Number of recent tokens to consider for repetition penalty.",
+          "example": 64
+        },
+        "repeat_penalty": {
+          "type": "number",
+          "description": "Penalty applied to repeated tokens.",
+          "example": 1
+        },
+        "mirostat": {
+          "type": "boolean",
+          "description": "Enables or disables Mirostat sampling.",
+          "example": false
+        },
+        "mirostat_tau": {
+          "type": "number",
+          "description": "Target entropy for Mirostat sampling.",
+          "example": 5
+        },
+        "mirostat_eta": {
+          "type": "number",
+          "description": "Learning rate for Mirostat sampling.",
+          "example": 0.1
+        },
+        "penalize_nl": {
+          "type": "boolean",
+          "description": "Whether to penalize newline tokens.",
+          "example": false
+        },
+        "ignore_eos": {
+          "type": "boolean",
+          "description": "Whether to ignore end-of-sequence tokens during generation.",
+          "example": false
+        },
+        "n_probs": {
+          "type": "integer",
+          "description": "Number of top token probabilities to return for each generated token.",
+          "example": 0
+        },
+        "min_keep": {
+          "type": "integer",
+          "description": "Minimum number of candidate tokens the samplers must keep.",
+          "example": 0
+        },
+        "engine": {
+          "type": "string",
+          "description": "The engine used to run the model.",
+          "example": "llama-cpp"
+        },
+        "prompt_template": {
+          "type": "string",
+          "description": "Template used for formatting prompts.",
+          "example": "\n\n<|system|>\n{system_message}</s>\n\n\n\n\n<|user|>\n{prompt}</s>\n\n\n<|assistant|>\n\n"
+        },
+        "ctx_len": {
+          "type": "integer",
+          "description": "Context length for the model.",
+          "example": 4096
+        },
+        "n_parallel": {
+          "type": "integer",
+          "description": "Number of parallel threads for execution.",
+          "example": 1
+        },
+        "ngl": {
+          "type": "integer",
+          "description": "Number of GPU layers.",
+          "example": 33
+        }
+      }
     },
     "DeleteModelResponseDto": {
       "type": "object",
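For reviewers, here is a minimal client sketch exercising the start/stop operations that the ModelStartDto and ModelStopDto schemas above describe. The base URL, port, and the /v1/models/start and /v1/models/stop paths are assumptions inferred from the operation summaries, not confirmed by this diff; the full cortex.json remains the authoritative source for routes.

# Minimal sketch of a client for the start/stop endpoints described by
# ModelStartDto and ModelStopDto above. Base URL, port, and exact paths
# are assumptions, not part of this diff.
import json
import urllib.request

BASE_URL = "http://127.0.0.1:39281"  # assumed local Cortex server address

def post(path: str, body: dict) -> dict:
    """POST a JSON body to the server and decode the JSON response."""
    req = urllib.request.Request(
        BASE_URL + path,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)

# ModelStartDto: only "model" is required; optional fields such as
# ctx_len and ngl override the values loaded from model.yml.
print(post("/v1/models/start", {"model": "llama3:8b-gguf-q6-k", "ctx_len": 4096, "ngl": 32}))

# ModelStopDto: only the model name is required.
print(post("/v1/models/stop", {"model": "llama3:8b-gguf-q6-k"}))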