Merge pull request #484 from rhatdan/run

Allow users to set ctx-size via command line

rhatdan authored Nov 22, 2024
2 parents 9e0d976 + 0719c16 commit 590fe3e
Showing 8 changed files with 119 additions and 18 deletions.
13 changes: 13 additions & 0 deletions docs/ramalama-run.1.md
@@ -11,12 +11,25 @@ ramalama\-run - run specified AI Model as a chatbot
#### **--authfile**=*password*
path of the authentication file for OCI registries

#### **--ctx-size**, **-c**
size of the prompt context (default: 2048, 0 = loaded from model)

#### **--help**, **-h**
show this help message and exit

#### **--name**, **-n**
name of the container to run the Model in

#### **--temp**="0.8"
Temperature of the response from the AI Model
llama.cpp explains this as:

The lower the number is, the more deterministic the response will be.

The higher the number is, the more creative the response will be, but also the more likely to go off topic if set too high.

Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories.
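
For example, a more deterministic chat session with a larger prompt context might be started like this (the model name is a placeholder, not part of this change):

```
ramalama run -c 4096 --temp 0.2 granite
```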

#### **--tls-verify**=*true*
require HTTPS and verify certificates when contacting OCI registries

13 changes: 13 additions & 0 deletions docs/ramalama-serve.1.md
@@ -20,6 +20,9 @@ For REST API endpoint documentation, see: [https://github.com/ggerganov/llama.cp
#### **--authfile**=*password*
path of the authentication file for OCI registries

#### **--ctx-size**, **-c**
size of the prompt context (default: 2048, 0 = loaded from model)

#### **--detach**, **-d**
Run the container in the background and print the new container ID.
The default is TRUE. The --nocontainer option forces this option to False.
@@ -47,6 +50,16 @@ Name of the container to run the Model in.
#### **--port**, **-p**
port for AI Model server to listen on

#### **--temp**="0.8"
Temperature of the response from the AI Model
llama.cpp explains this as:

The lower the number is, the more deterministic the response will be.

The higher the number is, the more creative the response will be, but also the more likely to go off topic if set too high.

Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories.
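
For example, serving a model with a larger prompt context on an alternate port might look like this (the model name is a placeholder, not part of this change):

```
ramalama serve -c 4096 --temp 0.8 --port 8081 granite
```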

#### **--tls-verify**=*true*
require HTTPS and verify certificates when contacting OCI registries

21 changes: 18 additions & 3 deletions docs/ramalama.conf
@@ -17,14 +17,19 @@

[ramalama]

# OCI model car image
# Image to use when building and pushing --type=car models
#
#carimage = "registry.access.redhat.com/ubi9-micro:latest"

# Run RamaLama in the default container.
#
#container = true

# OCI Model Car image
# Image to use when building and pushing --type=car models
# Size of the prompt context (0 = loaded from model)
#
#carimage = "registry.access.redhat.com/ubi9-micro:latest"
#ctx_size=2048


# Run RamaLama using the specified container engine.
#
@@ -52,6 +57,16 @@
#
#store = "$HOME/.local/share/ramalama"

# Temperature of the response from the AI Model
# llama.cpp explains this as:
#
# The lower the number is, the more deterministic the response will be.
#
# The higher the number is, the more creative the response will be, but also the more likely to go off topic if set too high.
#
# Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories.
#temp=0.8

# Specify the default transport to be used for pulling and pushing of AI Models.
# Options: oci, ollama, huggingface.
#
23 changes: 18 additions & 5 deletions docs/ramalama.conf.5.md
@@ -56,16 +56,20 @@ The ramalama table contains settings to configure and manage the OCI runtime.

[ramalama]

**carimage**="registry.access.redhat.com/ubi9-micro:latest"

OCI model car image

Image to be used when building and pushing --type=car models

**container**=true

Run RamaLama in the default container.
RAMALAMA_IN_CONTAINER environment variable overrides this field.

**carimage**="registry.access.redhat.com/ubi9-micro:latest"
**ctx_size**=2048

OCI Model Car image

Image to be used when building and pushing --type=car models
Size of the prompt context (0 = loaded from model)

**engine**="podman"

@@ -95,9 +99,18 @@ Options: llama.cpp, vllm

Store AI Models in the specified directory

**temp**="0.8"

Temperature of the response from the AI Model
llama.cpp explains this as:

The lower the number is, the more deterministic the response will be.

The higher the number is, the more creative the response will be, but also the more likely to go off topic if set too high.

Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories.
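
For example, setting these defaults in ramalama.conf might look like the sketch below (illustrative values; quoting follows the commented defaults shipped in ramalama.conf):

```
[ramalama]
ctx_size=4096
temp=0.8
```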

**transport**="ollama"

Specify the default transport to be used for pulling and pushing of AI Models.
Options: oci, ollama, huggingface.
RAMALAMA_TRANSPORT environment variable overrides this field.

22 changes: 21 additions & 1 deletion ramalama/cli.py
@@ -617,7 +617,17 @@ def push_cli(args):
def run_parser(subparsers):
parser = subparsers.add_parser("run", help="run specified AI Model as a chatbot")
parser.add_argument("--authfile", help="path of the authentication file")
parser.add_argument(
"-c",
"--ctx-size",
dest="context",
default=config.get('ctx_size', 2048),
help="size of the prompt context (0 = loaded from model)",
)
parser.add_argument("-n", "--name", dest="name", help="name of container in which the Model will be run")
parser.add_argument(
"--temp", default=config.get('temp', "0.8"), help="temperature of the response from the AI model"
)
parser.add_argument(
"--tls-verify",
dest="tlsverify",
@@ -639,12 +649,22 @@ def run_cli(args):
def serve_parser(subparsers):
parser = subparsers.add_parser("serve", help="serve REST API on specified AI Model")
parser.add_argument("--authfile", help="path of the authentication file")
parser.add_argument(
"-c",
"--ctx-size",
dest="context",
default=config.get('ctx_size', 2048),
help="size of the prompt context (0 = loaded from model)",
)
parser.add_argument("-d", "--detach", action="store_true", dest="detach", help="run the container in detached mode")
parser.add_argument("--host", default=config.get('host', "0.0.0.0"), help="IP address to listen")
parser.add_argument("-n", "--name", dest="name", help="name of container in which the Model will be run")
parser.add_argument(
"-p", "--port", default=config.get('port', "8080"), help="port for AI Model server to listen on"
)
parser.add_argument(
"--temp", default=config.get('temp', "0.8"), help="temperature of the response from the AI model"
)
parser.add_argument(
"--tls-verify",
dest="tlsverify",
@@ -746,7 +766,7 @@ def _rm_model(models, args):
if not args.ignore:
raise e
try:
# attempt to remove as a container image
# attempt to remove as a container image
m = OCI(model, config.get('engine', container_manager()))
m.remove(args, ignore_stderr=True)
return
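
Because the new add_argument calls above take their defaults from config.get, a value set in ramalama.conf is used when the flag is omitted, an explicit command-line flag overrides it, and the built-in fallbacks (2048 and "0.8") apply when neither is set. A hypothetical invocation overriding both (the model name is a placeholder):

    ramalama serve --ctx-size 8192 --temp 0.4 granite
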
29 changes: 24 additions & 5 deletions ramalama/model.py
@@ -39,7 +39,6 @@ class Model:

model = ""
type = "Model"
common_params = ["-c", "2048"]

def __init__(self, model):
self.model = model
@@ -257,15 +256,26 @@ def run(self, args):
if not args.container:
exec_model_path = model_path

exec_args = ["llama-cli", "-m", exec_model_path, "--in-prefix", "", "--in-suffix", ""]
exec_args = [
"llama-cli",
"-m",
exec_model_path,
"--in-prefix",
"",
"--in-suffix",
"",
"-c",
f"{args.context}",
"--temp",
f"{args.temp}",
]

if not args.debug:
exec_args += ["--no-display-prompt"]

exec_args += [
"-p",
prompt,
] + self.common_params
]

if not args.ARGS and sys.stdin.isatty():
exec_args.append("-cnv")
@@ -301,7 +311,16 @@ def serve(self, args):
if not args.container and not args.generate:
exec_model_path = model_path

exec_args = ["llama-server", "--port", args.port, "-m", exec_model_path]
exec_args = ["llama-server",
"--port",
args.port,
"-m",
exec_model_path,
"-c",
f"{args.context}",
"--temp",
f"{args.temp}",
]
if args.runtime == "vllm":
if not (exec_model_path.endswith(".GGUF") or exec_model_path.endswith(".gguf")):
exec_model_path = os.path.dirname(exec_model_path)
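
With these arguments in place, a dry run such as "ramalama run -c 4096 MODEL" is expected to produce a llama-cli command roughly like the following (the model path and prompt are placeholders, matching the test expectations below):

    llama-cli -m /path/to/model --in-prefix "" --in-suffix "" -c 4096 --temp 0.8 --no-display-prompt -p "<prompt>"
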
12 changes: 8 additions & 4 deletions test/system/030-run.bats
@@ -14,10 +14,13 @@ load helpers
run_ramalama --dryrun run ${model}
is "$output" "${verify_begin} ramalama_.*" "dryrun correct"
is "$output" ".*${model}" "verify model name"
is "$output" ".*-c 2048" "verify model name"

run_ramalama --dryrun run --name foobar ${model}
run_ramalama --dryrun run -c 4096 --name foobar ${model}
is "$output" "${verify_begin} foobar .*" "dryrun correct with --name"
is "$output" ".*${model}" "verify model name"
is "$output" ".*-c 4096" "verify ctx-size is set"
is "$output" ".*--temp 0.8" "verify temp is set"

run_ramalama --dryrun run --name foobar ${model}
is "$output" "${verify_begin} foobar .*" "dryrun correct with --name"
@@ -28,10 +31,11 @@ load helpers
RAMALAMA_IMAGE=${image} run_ramalama --dryrun run ${model}
is "$output" ".*${image} /bin/sh -c" "verify image name"
else
run_ramalama --dryrun run ${model}
is "$output" 'llama-cli -m /path/to/model --in-prefix --in-suffix --no-display-prompt -p.*' "dryrun correct"
run_ramalama --dryrun run -c 4096 ${model}
is "$output" 'llama-cli -m /path/to/model --in-prefix --in-suffix -c 4096 --temp 0.8 --no-display-prompt -p.*' "dryrun correct"
is "$output" ".*-c 4096" "verify model name"

run_ramalama 1 run --name foobar tiny
run_ramalama 1 run --ctx-size=4096 --name foobar tiny
is "${lines[0]}" "Error: --nocontainer and --name options conflict. --name requires a container." "conflict between nocontainer and --name line"
fi
}
4 changes: 4 additions & 0 deletions test/system/040-serve.bats
@@ -22,6 +22,10 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable --name
run_ramalama --dryrun serve --host 127.1.2.3 --name foobar ${model}
assert "$output" =~ ".*--host 127.1.2.3" "verify --host is modified when run within container"
is "$output" ".*${model}" "verify model name"
is "$output" ".*--temp 0.8" "verify temp is set"

run_ramalama --dryrun serve --temp 0.1 ${model}
is "$output" ".*--temp 0.1" "verify temp is set"

run_ramalama 1 --nocontainer serve --name foobar tiny
is "${lines[0]}" "Error: --nocontainer and --name options conflict. --name requires a container." "conflict between nocontainer and --name line"
