Merge pull request #1625 from janhq/chore/models-api
chore: Running Models API reference
gabrielle-ong authored Nov 5, 2024
2 parents 5338a78 + 9cc9e5d commit 5ca8257
Showing 1 changed file with 233 additions and 19 deletions.
docs/static/openapi/cortex.json — 252 changes: 233 additions & 19 deletions
@@ -156,7 +156,7 @@
"get": {
"operationId": "HealthController_check",
"summary": "Check health",
"description": "Checks the health of the application.",
"description": "Performs a comprehensive check of the application's health status.",
"parameters": [],
"responses": {
"200": {
@@ -175,7 +175,7 @@
"delete": {
"operationId": "Terminate server process",
"summary": "Terminate server",
"description": "Terminate server process.",
"description": "Initiates the shutdown process for the server, ensuring that all active connections are gracefully closed and any ongoing processes are properly terminated.",
"parameters": [],
"responses": {
"200": {
@@ -329,12 +329,12 @@
}
},
"tags": [
"Models"
"Pulling Models"
]
},
"delete": {
"tags": [
"Models"
"Pulling Models"
],
"summary": "Stop model download",
"description": "Stops the download of a model with the corresponding taskId provided in the request body",
@@ -449,15 +449,15 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
"/v1/models/start": {
"post": {
"operationId": "ModelsController_startModel",
"summary": "Start model",
"description": "Load a model into memory.",
"description": "Load a model into memory. Note: Request body parameters will override those loaded from model.yml",
"requestBody": {
"required": true,
"content": {
@@ -484,7 +484,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
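The updated description notes that request-body parameters override the values loaded from model.yml. As a sketch of what that override looks like, using field names from the ModelStartDto properties added later in this diff, a POST /v1/models/start body might be:

    {
      "model": "llama3:8b-gguf-q6-k",
      "ctx_len": 4096,
      "ngl": 32
    }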
@@ -498,7 +498,7 @@
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ModelStartDto"
"$ref": "#/components/schemas/ModelStopDto"
},
"example": {
"model": "llama3:8b-gguf-q6-k"
@@ -519,7 +519,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
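With the $ref corrected from ModelStartDto to ModelStopDto, the stop request needs only the model name, matching the example shown in the hunk above:

    {
      "model": "llama3:8b-gguf-q6-k"
    }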
@@ -552,7 +552,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
},
"delete": {
@@ -583,7 +583,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
@@ -625,7 +625,7 @@
}
},
"tags": [
"Models"
"Running Models"
]
}
},
@@ -668,7 +668,7 @@
}
},
"tags": [
"Models"
"Pulling Models"
]
}
},
@@ -1368,8 +1368,12 @@
"description": "These endpoints manage the lifecycle of an Assistant within a conversation thread."
},
{
"name": "Models",
"description": "These endpoints provide a list and descriptions of all available models within the Cortex framework."
"name": "Pulling Models",
"description": "These endpoints handle downloading and importing models."
},
{
"name": "Running Models",
"description": "These endpoints support a range of operations that allow users to effectively control and interact with their models"
},
{
"name": "Server",
@@ -1399,7 +1403,8 @@
"Chat",
"Engines",
"Events",
"Models",
"Pulling Models",
"Running Models",
"Processes",
"Status",
"Server"
@@ -3235,8 +3240,60 @@
"type": "string",
"example": "llama3:8b-gguf-q6-k",
"description": "A downloaded model name."
},
"ctx_len": {
"type": "number",
"description": "The context length for model operations varies; the maximum depends on the specific model used.",
"example": 4096
},
"ngl": {
"type": "number",
"description": "Determines GPU layer usage.",
"example": 32
},
"n_parallel": {
"type": "number",
"minimum": 1,
"description": "Number of parallel processing units to use.",
"example": 1
},
"cache_type": {
"type": "string",
"description": "KV cache type: f16, q8_0, q4_0, default is f16",
"example": "f16"
},
"caching_enabled": {
"type": "boolean",
"description": "To enable prompt caching or not",
"example": true
},
"model_path": {
"type": "string",
"description": "Local path to LLM model file",
"example": "/tmp/model.gguf"
},
"mmproj": {
"type": "string",
"description": "Local path to mmproj model file",
"example": "/tmp/model.gguf"
}
}
},
"required": [
"model"
]
},
"ModelStopDto": {
"type": "object",
"properties": {
"model": {
"type": "string",
"example": "llama3:8b-gguf-q6-k",
"description": "A downloaded model name."
}
},
"required": [
"model"
]
},
"ImportModelRequest": {
"type": "object",
@@ -3256,7 +3313,10 @@
"option": {
"type": "string",
"description": "Import options such as symlink or copy.",
"enum": ["symlink", "copy"]
"enum": [
"symlink",
"copy"
]
}
},
"required": [
@@ -3551,7 +3611,161 @@
},
"UpdateModelDto": {
"type": "object",
"properties": {}
"properties": {
"files": {
"type": "array",
"description": "List of file paths associated with the model. Can be relative or absolute.",
"items": {
"type": "string",
"example": "models\\cortex.so\\tinyllama\\1b-gguf\\model.gguf"
}
},
"stop": {
"type": "array",
"description": "Tokens that signal the end of generation.",
"items": {
"type": "string"
},
"example": [
"</s>"
]
},
"stream": {
"type": "boolean",
"description": "Whether to stream the output as it is generated.",
"example": true
},
"top_p": {
"type": "number",
"description": "Controls nucleus sampling; the model considers the results of the tokens with top_p probability mass.",
"example": 0.95
},
"temperature": {
"type": "number",
"description": "Controls randomness in token selection; lower values make the output more deterministic.",
"example": 0.7
},
"frequency_penalty": {
"type": "number",
"description": "Penalizes repeated tokens based on their frequency.",
"example": 0
},
"presence_penalty": {
"type": "number",
"description": "Penalizes tokens that have already appeared in the output.",
"example": 0
},
"max_tokens": {
"type": "integer",
"description": "Maximum number of tokens to generate.",
"example": 4096
},
"seed": {
"type": "integer",
"description": "Seed for random number generation to ensure reproducibility; -1 for random seed.",
"example": -1
},
"dynatemp_range": {
"type": "number",
"description": "Range for dynamic temperature adjustment.",
"example": 0
},
"dynatemp_exponent": {
"type": "number",
"description": "Exponent for dynamic temperature adjustment.",
"example": 1
},
"top_k": {
"type": "integer",
"description": "Limits the sampling pool to the top_k most probable tokens.",
"example": 40
},
"min_p": {
"type": "number",
"description": "Minimum probability threshold for token selection.",
"example": 0.05
},
"tfs_z": {
"type": "number",
"description": "Threshold for token frequency sampling.",
"example": 1
},
"typ_p": {
"type": "number",
"description": "Controls typical sampling; similar to top_p but focuses on local token distribution.",
"example": 1
},
"repeat_last_n": {
"type": "integer",
"description": "Number of recent tokens to consider for repetition penalty.",
"example": 64
},
"repeat_penalty": {
"type": "number",
"description": "Penalty applied to repeated tokens.",
"example": 1
},
"mirostat": {
"type": "boolean",
"description": "Enables or disables Mirostat sampling.",
"example": false
},
"mirostat_tau": {
"type": "number",
"description": "Target entropy for Mirostat sampling.",
"example": 5
},
"mirostat_eta": {
"type": "number",
"description": "Learning rate for Mirostat sampling.",
"example": 0.1
},
"penalize_nl": {
"type": "boolean",
"description": "Whether to penalize newline tokens.",
"example": false
},
"ignore_eos": {
"type": "boolean",
"description": "Whether to ignore end-of-sequence tokens during generation.",
"example": false
},
"n_probs": {
"type": "integer",
"description": "Number of probabilities to consider for each token.",
"example": 0
},
"min_keep": {
"type": "integer",
"description": "Minimum number of tokens to keep in the buffer.",
"example": 0
},
"engine": {
"type": "string",
"description": "The engine used to run the model.",
"example": "llama-cpp"
},
"prompt_template": {
"type": "string",
"description": "Template used for formatting prompts.",
"example": "\n\n<|system|>\n{system_message}</s>\n\n\n\n\n<|user|>\n{prompt}</s>\n\n\n<|assistant|>\n\n"
},
"ctx_len": {
"type": "integer",
"description": "Context length for the model.",
"example": 4096
},
"n_parallel": {
"type": "integer",
"description": "Number of parallel threads for execution.",
"example": 1
},
"ngl": {
"type": "integer",
"description": "Number of GPU layers.",
"example": 33
}
}
},
"DeleteModelResponseDto": {
"type": "object",
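Taken together, the UpdateModelDto additions turn the previously empty schema into a full set of tunable fields. Assuming the model-update endpoint accepts this DTO as its JSON request body (the route itself is outside this diff), a payload exercising a few of the newly documented fields might look like:

    {
      "temperature": 0.7,
      "top_p": 0.95,
      "max_tokens": 4096,
      "ctx_len": 4096,
      "ngl": 33,
      "engine": "llama-cpp",
      "stop": ["</s>"]
    }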
