Merge pull request #1625 from janhq/chore/models-api
chore: Running Models API reference
gabrielle-ong authored Nov 5, 2024
2 parents 5338a78 + 9cc9e5d commit 5ca8257
Showing 1 changed file with 233 additions and 19 deletions.
docs/static/openapi/cortex.json — 252 changes: 233 additions & 19 deletions
@@ -156,7 +156,7 @@
"get": {
"operationId": "HealthController_check",
"summary": "Check health",
"description": "Checks the health of the application.",
"description": "Performs a comprehensive check of the application's health status.",
"parameters": [],
"responses": {
"200": {
@@ -175,7 +175,7 @@
"delete": {
"operationId": "Terminate server process",
"summary": "Terminate server",
"description": "Terminate server process.",
"description": "Initiates the shutdown process for the server, ensuring that all active connections are gracefully closed and any ongoing processes are properly terminated.",
"parameters": [],
"responses": {
"200": {
@@ -329,12 +329,12 @@
}
},
"tags": [
"Models"
"Pulling Models"
]
},
"delete": {
"tags": [
"Models"
"Pulling Models"
],
"summary": "Stop model download",
"description": "Stops the download of a model with the corresponding taskId provided in the request body",
@@ -449,15 +449,15 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
"/v1/models/start": {
"post": {
"operationId": "ModelsController_startModel",
"summary": "Start model",
"description": "Load a model into memory.",
"description": "Load a model into memory. Note: Request body parameters will override those loaded from model.yml",
"requestBody": {
"required": true,
"content": {
@@ -484,7 +484,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
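The updated description notes that request-body parameters override the values loaded from model.yml. As a sketch of what that override looks like, using field names from the ModelStartDto properties added later in this diff, a POST /v1/models/start body might be:

    {
      "model": "llama3:8b-gguf-q6-k",
      "ctx_len": 4096,
      "ngl": 32
    }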
@@ -498,7 +498,7 @@
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ModelStartDto"
"$ref": "#/components/schemas/ModelStopDto"
},
"example": {
"model": "llama3:8b-gguf-q6-k"
@@ -519,7 +519,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
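With the $ref corrected from ModelStartDto to ModelStopDto, the stop request needs only the model name, matching the example shown in the hunk above:

    {
      "model": "llama3:8b-gguf-q6-k"
    }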
@@ -552,7 +552,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
},
"delete": {
@@ -583,7 +583,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
@@ -625,7 +625,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
@@ -668,7 +668,7 @@
}
},
"tags": [
"Models"
"Pulling Models"
]
}
},
@@ -1368,8 +1368,12 @@
"description": "These endpoints manage the lifecycle of an Assistant within a conversation thread."
},
{
"name": "Models",
"description": "These endpoints provide a list and descriptions of all available models within the Cortex framework."
"name": "Pulling Models",
"description": "These endpoints handle downloading and importing models."
},
{
"name": "Running Models",
"description": "These endpoints support a range of operations that allow users to effectively control and interact with their models"
},
{
"name": "Server",
@@ -1399,7 +1403,8 @@
"Chat",
"Engines",
"Events",
"Models",
"Pulling Models",
"Running Models",
"Processes",
"Status",
"Server"
@@ -3235,8 +3240,60 @@
"type": "string",
"example": "llama3:8b-gguf-q6-k",
"description": "A downloaded model name."
},
"ctx_len": {
"type": "number",
"description": "The context length for model operations varies; the maximum depends on the specific model used.",
"example": 4096
},
"ngl": {
"type": "number",
"description": "Determines GPU layer usage.",
"example": 32
},
"n_parallel": {
"type": "number",
"minimum": 1,
"description": "Number of parallel processing units to use.",
"example": 1
},
"cache_type": {
"type": "string",
"description": "KV cache type: f16, q8_0, q4_0, default is f16",
"example": "f16"
},
"caching_enabled": {
"type": "boolean",
"description": "To enable prompt caching or not",
"example": true
},
"model_path": {
"type": "string",
"description": "Local path to LLM model file",
"example": "/tmp/model.gguf"
},
"mmproj": {
"type": "string",
"description": "Local path to mmproj model file",
"example": "/tmp/model.gguf"
}
}
},
"required": [
"model"
]
},
"ModelStopDto": {
"type": "object",
"properties": {
"model": {
"type": "string",
"example": "llama3:8b-gguf-q6-k",
"description": "A downloaded model name."
}
},
"required": [
"model"
]
},
"ImportModelRequest": {
"type": "object",
@@ -3256,7 +3313,10 @@
"option": {
"type": "string",
"description": "Import options such as symlink or copy.",
"enum": ["symlink", "copy"]
"enum": [
"symlink",
"copy"
]
}
},
"required": [
@@ -3551,7 +3611,161 @@
},
"UpdateModelDto": {
"type": "object",
"properties": {}
"properties": {
"files": {
"type": "array",
"description": "List of file paths associated with the model. Can be relative or absolute.",
"items": {
"type": "string",
"example": "models\\cortex.so\\tinyllama\\1b-gguf\\model.gguf"
}
},
"stop": {
"type": "array",
"description": "Tokens that signal the end of generation.",
"items": {
"type": "string"
},
"example": [
"</s>"
]
},
"stream": {
"type": "boolean",
"description": "Whether to stream the output as it is generated.",
"example": true
},
"top_p": {
"type": "number",
"description": "Controls nucleus sampling; the model considers the results of the tokens with top_p probability mass.",
"example": 0.95
},
"temperature": {
"type": "number",
"description": "Controls randomness in token selection; lower values make the output more deterministic.",
"example": 0.7
},
"frequency_penalty": {
"type": "number",
"description": "Penalizes repeated tokens based on their frequency.",
"example": 0
},
"presence_penalty": {
"type": "number",
"description": "Penalizes tokens that have already appeared in the output.",
"example": 0
},
"max_tokens": {
"type": "integer",
"description": "Maximum number of tokens to generate.",
"example": 4096
},
"seed": {
"type": "integer",
"description": "Seed for random number generation to ensure reproducibility; -1 for random seed.",
"example": -1
},
"dynatemp_range": {
"type": "number",
"description": "Range for dynamic temperature adjustment.",
"example": 0
},
"dynatemp_exponent": {
"type": "number",
"description": "Exponent for dynamic temperature adjustment.",
"example": 1
},
"top_k": {
"type": "integer",
"description": "Limits the sampling pool to the top_k most probable tokens.",
"example": 40
},
"min_p": {
"type": "number",
"description": "Minimum probability threshold for token selection.",
"example": 0.05
},
"tfs_z": {
"type": "number",
"description": "Threshold for token frequency sampling.",
"example": 1
},
"typ_p": {
"type": "number",
"description": "Controls typical sampling; similar to top_p but focuses on local token distribution.",
"example": 1
},
"repeat_last_n": {
"type": "integer",
"description": "Number of recent tokens to consider for repetition penalty.",
"example": 64
},
"repeat_penalty": {
"type": "number",
"description": "Penalty applied to repeated tokens.",
"example": 1
},
"mirostat": {
"type": "boolean",
"description": "Enables or disables Mirostat sampling.",
"example": false
},
"mirostat_tau": {
"type": "number",
"description": "Target entropy for Mirostat sampling.",
"example": 5
},
"mirostat_eta": {
"type": "number",
"description": "Learning rate for Mirostat sampling.",
"example": 0.1
},
"penalize_nl": {
"type": "boolean",
"description": "Whether to penalize newline tokens.",
"example": false
},
"ignore_eos": {
"type": "boolean",
"description": "Whether to ignore end-of-sequence tokens during generation.",
"example": false
},
"n_probs": {
"type": "integer",
"description": "Number of probabilities to consider for each token.",
"example": 0
},
"min_keep": {
"type": "integer",
"description": "Minimum number of tokens to keep in the buffer.",
"example": 0
},
"engine": {
"type": "string",
"description": "The engine used to run the model.",
"example": "llama-cpp"
},
"prompt_template": {
"type": "string",
"description": "Template used for formatting prompts.",
"example": "\n\n<|system|>\n{system_message}</s>\n\n\n\n\n<|user|>\n{prompt}</s>\n\n\n<|assistant|>\n\n"
},
"ctx_len": {
"type": "integer",
"description": "Context length for the model.",
"example": 4096
},
"n_parallel": {
"type": "integer",
"description": "Number of parallel threads for execution.",
"example": 1
},
"ngl": {
"type": "integer",
"description": "Number of GPU layers.",
"example": 33
}
}
},
"DeleteModelResponseDto": {
"type": "object",
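Taken together, the UpdateModelDto additions turn the previously empty schema into a full set of tunable fields. Assuming the model-update endpoint accepts this DTO as its JSON request body (the route itself is outside this diff), a payload exercising a few of the newly documented fields might look like:

    {
      "temperature": 0.7,
      "top_p": 0.95,
      "max_tokens": 4096,
      "ctx_len": 4096,
      "ngl": 33,
      "engine": "llama-cpp",
      "stop": ["</s>"]
    }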
