chore: Running Models API reference #1625

Merged · 13 commits · Nov 5, 2024
docs/static/openapi/cortex.json: 252 changes (233 additions, 19 deletions)
@@ -156,7 +156,7 @@
"get": {
"operationId": "HealthController_check",
"summary": "Check health",
"description": "Checks the health of the application.",
"description": "Performs a comprehensive check of the application's health status.",
"parameters": [],
"responses": {
"200": {
@@ -175,7 +175,7 @@
"delete": {
"operationId": "Terminate server process",
"summary": "Terminate server",
"description": "Terminate server process.",
"description": "Initiates the shutdown process for the server, ensuring that all active connections are gracefully closed and any ongoing processes are properly terminated.",
"parameters": [],
"responses": {
"200": {
@@ -329,12 +329,12 @@
}
},
"tags": [
"Models"
"Pulling Models"
]
},
"delete": {
"tags": [
"Models"
"Pulling Models"
],
"summary": "Stop model download",
"description": "Stops the download of a model with the corresponding taskId provided in the request body",
@@ -449,15 +449,15 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
"/v1/models/start": {
"post": {
"operationId": "ModelsController_startModel",
"summary": "Start model",
"description": "Load a model into memory.",
"description": "Load a model into memory. Note: Request body parameters will override those loaded from model.yml",
"requestBody": {
"required": true,
"content": {
@@ -484,7 +484,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
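A sketch of the start call, assuming a local server. The path, the required model field, and the example model name are from the spec; per the note above, any extra fields override the values loaded from model.yml.

```python
import requests

BASE = "http://127.0.0.1:39281"  # assumed local Cortex server address

# Load the model into memory; ctx_len and ngl override model.yml values.
resp = requests.post(
    BASE + "/v1/models/start",
    json={
        "model": "llama3:8b-gguf-q6-k",  # required; example name from the spec
        "ctx_len": 4096,                 # optional override
        "ngl": 32,                       # optional override
    },
)
resp.raise_for_status()
print(resp.json())
```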
@@ -498,7 +498,7 @@
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ModelStartDto"
"$ref": "#/components/schemas/ModelStopDto"
},
"example": {
"model": "llama3:8b-gguf-q6-k"
@@ -519,7 +519,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
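The matching stop call. The path and verb are collapsed in this diff (a POST to /v1/models/stop, mirroring start, is an assumption); the body follows ModelStopDto, which after the $ref fix above needs only the model name.

```python
import requests

BASE = "http://127.0.0.1:39281"  # assumed local Cortex server address
STOP_PATH = "/v1/models/stop"    # assumption: path and verb are collapsed in this diff

# Unload the model; ModelStopDto requires only the model name.
resp = requests.post(BASE + STOP_PATH, json={"model": "llama3:8b-gguf-q6-k"})
print(resp.status_code)
```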
@@ -552,7 +552,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
},
"delete": {
@@ -583,7 +583,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
@@ -625,7 +625,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
@@ -668,7 +668,7 @@
}
},
"tags": [
"Models"
"Pulling Models"
]
}
},
@@ -1368,8 +1368,12 @@
"description": "These endpoints manage the lifecycle of an Assistant within a conversation thread."
},
{
"name": "Models",
"description": "These endpoints provide a list and descriptions of all available models within the Cortex framework."
"name": "Pulling Models",
"description": "These endpoints handle downloading and importing models."
},
{
"name": "Running Models",
"description": "These endpoints support a range of operations that allow users to effectively control and interact with their models"
},
{
"name": "Server",
@@ -1399,7 +1403,8 @@
"Chat",
"Engines",
"Events",
"Models",
"Pulling Models",
"Running Models",
"Processes",
"Status",
"Server"
@@ -3235,8 +3240,60 @@
"type": "string",
"example": "llama3:8b-gguf-q6-k",
"description": "A downloaded model name."
},
"ctx_len": {
"type": "number",
"description": "The context length for model operations varies; the maximum depends on the specific model used.",
"example": 4096
},
"ngl": {
"type": "number",
"description": "Determines GPU layer usage.",
"example": 32
},
"n_parallel": {
"type": "number",
"minimum": 1,
"description": "Number of parallel processing units to use.",
"example": 1
},
"cache_type": {
"type": "string",
"description": "KV cache type: f16, q8_0, q4_0, default is f16",
"example": "f16"
},
"caching_enabled": {
"type": "boolean",
"description": "To enable prompt caching or not",
"example": true
},
"model_path": {
"type": "string",
"description": "Local path to LLM model file",
"example": "/tmp/model.gguf"
},
"mmproj": {
"type": "string",
"description": "Local path to mmproj model file",
"example": "/tmp/model.gguf"
}
}
},
"required": [
"model"
]
},
"ModelStopDto": {
"type": "object",
"properties": {
"model": {
"type": "string",
"example": "llama3:8b-gguf-q6-k",
"description": "A downloaded model name."
}
},
"required": [
"model"
]
},
"ImportModelRequest": {
"type": "object",
@@ -3256,7 +3313,10 @@
"option": {
"type": "string",
"description": "Import options such as symlink or copy.",
"enum": ["symlink", "copy"]
"enum": [
"symlink",
"copy"
]
}
},
"required": [
@@ -3551,7 +3611,161 @@
},
"UpdateModelDto": {
"type": "object",
"properties": {}
"properties": {
"files": {
"type": "array",
"description": "List of file paths associated with the model. Can be relative or absolute.",
"items": {
"type": "string",
"example": "models\\cortex.so\\tinyllama\\1b-gguf\\model.gguf"
}
},
"stop": {
"type": "array",
"description": "Tokens that signal the end of generation.",
"items": {
"type": "string"
},
"example": [
"</s>"
]
},
"stream": {
"type": "boolean",
"description": "Whether to stream the output as it is generated.",
"example": true
},
"top_p": {
"type": "number",
"description": "Controls nucleus sampling; the model considers the results of the tokens with top_p probability mass.",
"example": 0.95
},
"temperature": {
"type": "number",
"description": "Controls randomness in token selection; lower values make the output more deterministic.",
"example": 0.7
},
"frequency_penalty": {
"type": "number",
"description": "Penalizes repeated tokens based on their frequency.",
"example": 0
},
"presence_penalty": {
"type": "number",
"description": "Penalizes tokens that have already appeared in the output.",
"example": 0
},
"max_tokens": {
"type": "integer",
"description": "Maximum number of tokens to generate.",
"example": 4096
},
"seed": {
"type": "integer",
"description": "Seed for random number generation to ensure reproducibility; -1 for random seed.",
"example": -1
},
"dynatemp_range": {
"type": "number",
"description": "Range for dynamic temperature adjustment.",
"example": 0
},
"dynatemp_exponent": {
"type": "number",
"description": "Exponent for dynamic temperature adjustment.",
"example": 1
},
"top_k": {
"type": "integer",
"description": "Limits the sampling pool to the top_k most probable tokens.",
"example": 40
},
"min_p": {
"type": "number",
"description": "Minimum probability threshold for token selection.",
"example": 0.05
},
"tfs_z": {
"type": "number",
"description": "Threshold for token frequency sampling.",
"example": 1
},
"typ_p": {
"type": "number",
"description": "Controls typical sampling; similar to top_p but focuses on local token distribution.",
"example": 1
},
"repeat_last_n": {
"type": "integer",
"description": "Number of recent tokens to consider for repetition penalty.",
"example": 64
},
"repeat_penalty": {
"type": "number",
"description": "Penalty applied to repeated tokens.",
"example": 1
},
"mirostat": {
"type": "boolean",
"description": "Enables or disables Mirostat sampling.",
"example": false
},
"mirostat_tau": {
"type": "number",
"description": "Target entropy for Mirostat sampling.",
"example": 5
},
"mirostat_eta": {
"type": "number",
"description": "Learning rate for Mirostat sampling.",
"example": 0.1
},
"penalize_nl": {
"type": "boolean",
"description": "Whether to penalize newline tokens.",
"example": false
},
"ignore_eos": {
"type": "boolean",
"description": "Whether to ignore end-of-sequence tokens during generation.",
"example": false
},
"n_probs": {
"type": "integer",
"description": "Number of probabilities to consider for each token.",
"example": 0
},
"min_keep": {
"type": "integer",
"description": "Minimum number of tokens to keep in the buffer.",
"example": 0
},
"engine": {
"type": "string",
"description": "The engine used to run the model.",
"example": "llama-cpp"
},
"prompt_template": {
"type": "string",
"description": "Template used for formatting prompts.",
"example": "\n\n<|system|>\n{system_message}</s>\n\n\n\n\n<|user|>\n{prompt}</s>\n\n\n<|assistant|>\n\n"
},
"ctx_len": {
"type": "integer",
"description": "Context length for the model.",
"example": 4096
},
"n_parallel": {
"type": "integer",
"description": "Number of parallel threads for execution.",
"example": 1
},
"ngl": {
"type": "integer",
"description": "Number of GPU layers.",
"example": 33
}
}
},
"DeleteModelResponseDto": {
"type": "object",