From a8bc86210eec83b03dda31b74e15ddd02c1e2978 Mon Sep 17 00:00:00 2001 From: Sam Date: Sun, 15 Sep 2024 08:34:26 +1000 Subject: [PATCH] feat(docker,auth): Add Docker, Compose and optional auth --- Dockerfile | 47 ++++++++++++++ README.md | 119 +++++++++++++++++++++++++++-------- docker-compose.yaml | 38 +++++++++++ optillm.py | 150 +++++++++++++++++++++++++++++++------------- 4 files changed, 285 insertions(+), 69 deletions(-) create mode 100644 Dockerfile create mode 100644 docker-compose.yaml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5b22f47 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,47 @@ +# Build stage +FROM python:3.12-slim AS builder + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libc6-dev \ + && rm -rf /var/lib/apt/lists/* + +# Copy only the requirements file first to leverage Docker cache +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Final stage +FROM python:3.12-slim + +# Install curl for the healthcheck +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy installed dependencies from builder stage +COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin + +# Copy application code +COPY . . + +# Create a non-root user and switch to it +RUN useradd -m appuser +USER appuser + +# Set environment variables +ENV PYTHONUNBUFFERED=1 + +# Expose the port the app runs on +EXPOSE 8000 + +# Run the application +ENTRYPOINT ["python", "optillm.py"] diff --git a/README.md b/README.md index f20de50..c03459d 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,16 @@ optillm is an OpenAI API compatible optimizing inference proxy which implements ### plansearch-gpt-4o-mini on LiveCodeBench (Sep 2024) -| Model | pass@1 | pass@5 | pass@10 | -|-------|--------|--------|---------| -| plansearch-gpt-4o-mini | 44.03 | 59.31 | 63.5 | -| gpt-4o-mini | 43.9 | 50.61 | 53.25 | -| claude-3.5-sonnet | 51.3 | | | -| gpt-4o-2024-05-13 | 45.2 | | | -| gpt-4-turbo-2024-04-09 | 44.2 | | | +| Model | pass@1 | pass@5 | pass@10 | +| ---------------------- | ------ | ------ | ------- | +| plansearch-gpt-4o-mini | 44.03 | 59.31 | 63.5 | +| gpt-4o-mini | 43.9 | 50.61 | 53.25 | +| claude-3.5-sonnet | 51.3 | | | +| gpt-4o-2024-05-13 | 45.2 | | | +| gpt-4-turbo-2024-04-09 | 44.2 | | | ### moa-gpt-4o-mini on Arena-Hard-Auto (Aug 2024) + ![Results showing Mixture of Agents approach using gpt-4o-mini on Arena Hard Auto Benchmark](./moa-results.png) ## Installation @@ -32,7 +33,7 @@ pip install -r requirements.txt You can then run the optillm proxy as follows. ```bash -python optillm.py +python optillm.py 2024-09-06 07:57:14,191 - INFO - Starting server with approach: auto 2024-09-06 07:57:14,191 - INFO - Server configuration: {'approach': 'auto', 'mcts_simulations': 2, 'mcts_exploration': 0.2, 'mcts_depth': 1, 'best_of_n': 3, 'model': 'gpt-4o-mini', 'rstar_max_depth': 3, 'rstar_num_rollouts': 5, 'rstar_c': 1.4, 'base_url': ''} * Serving Flask app 'optillm' @@ -44,11 +45,11 @@ python optillm.py 2024-09-06 07:57:14,212 - INFO - Press CTRL+C to quit ``` -### Usage +## Usage -Once the proxy is running, you can just use it as a drop in replacement for an OpenAI client by setting the `base_url` as `http://localhost:8000/v1`. 
+Once the proxy is running, you can use it as a drop-in replacement for an OpenAI client by setting the `base_url` to `http://localhost:8000/v1`.
 
-```bash
+```python
import os
from openai import OpenAI

@@ -70,7 +71,7 @@ response = client.chat.completions.create(
print(response)
```

-You can control the technique you use for optimization by prepending the slug to the model name `{slug}-model-name`. E.g. in the above code we are using `moa` or
-mixture of agents as the optimization approach. In the proxy logs you will see the following showing the `moa` is been used with the base model as `gpt-4o-mini`.
+You can control the technique used for optimization by prepending its slug to the model name as `{slug}-model-name`. E.g. in the above code we are using `moa`, or
+mixture of agents, as the optimization approach. In the proxy logs you will see the following, showing that `moa` is being used with the base model `gpt-4o-mini`.

```bash
@@ -83,20 +84,86 @@ mixture of agents as the optimization approach. In the proxy logs you will see t

## Implemented techniques

-| Technique | Slug | Description |
-|-----------|----------------|-------------|
-| Agent | `agent ` | Determines which of the below approaches to take and then combines the results |
-| Monte Carlo Tree Search | `mcts` | Uses MCTS for decision-making in chat responses |
-| Best of N Sampling | `bon` | Generates multiple responses and selects the best one |
-| Mixture of Agents | `moa` | Combines responses from multiple critiques |
-| Round Trip Optimization | `rto` | Optimizes responses through a round-trip process |
-| Z3 Solver | `z3` | Utilizes the Z3 theorem prover for logical reasoning |
-| Self-Consistency | `self_consistency` | Implements an advanced self-consistency method |
-| PV Game | `pvg` | Applies a prover-verifier game approach at inference time |
-| R* Algorithm | `rstar` | Implements the R* algorithm for problem-solving |
-| CoT with Reflection | `cot_reflection` | Implements chain-of-thought reasoning with \<thinking\>, \<reflection\> and \<output\> sections |
-| PlanSearch | `plansearch` | Implements a search algorithm over candidate plans for solving a problem in natural language |
-| LEAP | `leap` | Learns task-specific principles from few shot examples |
+| Technique               | Slug               | Description                                                                                      |
+| ----------------------- | ------------------ | ------------------------------------------------------------------------------------------------ |
+| Agent                   | `agent`            | Determines which of the below approaches to take and then combines the results                  |
+| Monte Carlo Tree Search | `mcts`             | Uses MCTS for decision-making in chat responses                                                 |
+| Best of N Sampling      | `bon`              | Generates multiple responses and selects the best one                                          |
+| Mixture of Agents       | `moa`              | Combines responses from multiple critiques                                                      |
+| Round Trip Optimization | `rto`              | Optimizes responses through a round-trip process                                                |
+| Z3 Solver               | `z3`               | Utilizes the Z3 theorem prover for logical reasoning                                            |
+| Self-Consistency        | `self_consistency` | Implements an advanced self-consistency method                                                  |
+| PV Game                 | `pvg`              | Applies a prover-verifier game approach at inference time                                       |
+| R* Algorithm            | `rstar`            | Implements the R* algorithm for problem-solving                                                 |
+| CoT with Reflection     | `cot_reflection`   | Implements chain-of-thought reasoning with \<thinking\>, \<reflection\> and \<output\> sections |
+| PlanSearch              | `plansearch`       | Implements a search algorithm over candidate plans for solving a problem in natural language    |
+| LEAP                    | `leap`             | Learns task-specific principles from few-shot examples                                          |
+
+## Available Parameters
+
+optillm supports various command-line arguments and environment variables for configuration.
+
+| Parameter                | Description                                            | Default Value   |
+|--------------------------|--------------------------------------------------------|-----------------|
+| `--approach`             | Inference approach to use                              | `"auto"`        |
+| `--simulations`          | Number of MCTS simulations                             | 2               |
+| `--exploration`          | Exploration weight for MCTS                            | 0.2             |
+| `--depth`                | Simulation depth for MCTS                              | 1               |
+| `--best-of-n`            | Number of samples for best_of_n approach               | 3               |
+| `--model`                | OpenAI model to use                                    | `"gpt-4o-mini"` |
+| `--base-url`             | Base URL for OpenAI compatible endpoint                | `""`            |
+| `--rstar-max-depth`      | Maximum depth for rStar algorithm                      | 3               |
+| `--rstar-num-rollouts`   | Number of rollouts for rStar algorithm                 | 5               |
+| `--rstar-c`              | Exploration constant for rStar algorithm               | 1.4             |
+| `--n`                    | Number of final responses to be returned               | 1               |
+| `--return-full-response` | Return the full response including the CoT with tags   | `False`         |
+| `--port`                 | Specify the port to run the proxy                      | 8000            |
+| `--api-key`              | Optional API key for client authentication to optillm  | `""`            |
+
+When using Docker, these can be set as environment variables prefixed with `OPTILLM_`.
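+
+For example, the two invocations below are equivalent: an `OPTILLM_`-prefixed environment variable only seeds the default for the matching argument, so an explicit command-line flag still takes precedence. (The values shown are illustrative.)
+
+```bash
+# Set the approach and the best_of_n sample count via flags...
+python optillm.py --approach moa --best-of-n 5
+
+# ...or equivalently via environment variables
+OPTILLM_APPROACH=moa OPTILLM_BEST_OF_N=5 python optillm.py
+```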
+
+## Running with Docker
+
+optillm can optionally be built and run using Docker and the provided [Dockerfile](./Dockerfile).
+
+### Using Docker Compose
+
+1. Make sure you have Docker and Docker Compose installed on your system.
+
+2. Either update the environment variables in the `docker-compose.yaml` file or create a `.env` file in the project root directory and add any environment variables you want to set. For example, to set the OpenAI API key, add the following line to the `.env` file:
+
+   ```bash
+   OPENAI_API_KEY=your_openai_api_key_here
+   ```
+
+3. Run the following command to start optillm:
+
+   ```bash
+   docker compose up -d
+   ```
+
+   This will build the Docker image if it doesn't exist and start the optillm service.
+
+4. optillm will be available at `http://localhost:8000`.
+
+When using Docker, you can set these parameters as environment variables. For example, to set the approach and model, you would use:
+
+```bash
+OPTILLM_APPROACH=mcts
+OPTILLM_MODEL=gpt-4
+```
+
+To secure the optillm proxy with an API key, set the `OPTILLM_API_KEY` environment variable:
+
+```bash
+OPTILLM_API_KEY=your_secret_api_key
+```
+
+When the API key is set, clients must include it in their requests using the `Authorization` header:
+
+```plain
+Authorization: Bearer your_secret_api_key
+```
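+
+For example, an authenticated request to the proxy might look like this (`your_secret_api_key` and the prompt are placeholders):
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer your_secret_api_key" \
+  -d '{"model": "moa-gpt-4o-mini", "messages": [{"role": "user", "content": "Hello"}]}'
+```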

## References

diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..8e983ba
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,38 @@
+services:
+  &name optillm:
+    build:
+      context: https://github.com/codelion/optillm.git#main
+      # context: .
+      dockerfile: Dockerfile
+      tags:
+        - optillm:latest
+    image: optillm:latest
+    container_name: *name
+    hostname: *name
+    ports:
+      - "8000:8000"
+    environment:
+      # note: compose takes ${VAR:-default} defaults literally, so the default
+      # must not be quoted or the quotes end up in the value
+      OPENAI_API_KEY: ${OPENAI_API_KEY:-}
+      OPTILLM_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
+      # OPTILLM_API_KEY: ${OPTILLM_API_KEY:-} # optionally sets an API key for Optillm clients
+      # Uncomment and set values for other arguments (prefixed with OPTILLM_) as needed, e.g.:
+      # OPTILLM_APPROACH: auto
+      # OPTILLM_MODEL: gpt-4o-mini
+      # OPTILLM_SIMULATIONS: 2
+      # OPTILLM_EXPLORATION: 0.2
+      # OPTILLM_DEPTH: 1
+      # OPTILLM_BEST_OF_N: 3
+      # OPTILLM_RSTAR_MAX_DEPTH: 3
+      # OPTILLM_RSTAR_NUM_ROLLOUTS: 5
+      # OPTILLM_RSTAR_C: 1.4
+      # OPTILLM_N: 1
+      # OPTILLM_RETURN_FULL_RESPONSE: false
+      # OPTILLM_PORT: 8000
+    restart: on-failure
+    stop_grace_period: 2s
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://127.0.0.1:8000/health"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 3s
diff --git a/optillm.py b/optillm.py
index 6fcd716..f6d8134 100644
--- a/optillm.py
+++ b/optillm.py
@@ -1,6 +1,7 @@
import argparse
import logging
import os
+import secrets

from flask import Flask, request, jsonify
from openai import OpenAI
@@ -41,21 +42,41 @@
    'rstar_num_rollouts': 5,
    'rstar_c': 1.4,
    'n': 1,
+    'base_url': '',
+    'api_key': '',
+    'return_full_response': False,
+    'port': 8000,
}

+# List of known approaches
+known_approaches = ["mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar",
+                    "cot_reflection", "plansearch", "leap", "agent"]
+
+# Optional API key configuration to secure the proxy
+@app.before_request
+def check_api_key():
+    # Exempt the health endpoint so the Docker healthcheck works without credentials
+    if request.path == '/health':
+        return None
+    if server_config['api_key']:
+        auth_header = request.headers.get('Authorization')
+        if not auth_header or not auth_header.startswith('Bearer '):
+            return jsonify({"error": "Invalid Authorization header. Expected format: 'Authorization: Bearer YOUR_API_KEY'"}), 401
+
+        client_key = auth_header.split('Bearer ', 1)[1].strip()
+        if not secrets.compare_digest(client_key, server_config['api_key']):
+            return jsonify({"error": "Invalid API key"}), 401
+
@app.route('/v1/chat/completions', methods=['POST'])
def proxy():
    logger.info('Received request to /v1/chat/completions')
    data = request.get_json()
    logger.debug(f'Request data: {data}')
-    
+
    messages = data.get('messages', [])
    model = data.get('model', server_config['model'])
    n = data.get('n', server_config['n'])
-    
+
    system_prompt = next((msg['content'] for msg in messages if msg['role'] == 'system'), "")
    initial_query = next((msg['content'] for msg in messages if msg['role'] == 'user'), "")
-    
+
    approach = server_config['approach']
    base_url = server_config['base_url']

@@ -63,14 +84,21 @@ def proxy():
        client = OpenAI(api_key=API_KEY, base_url=base_url)
    else:
        client = default_client
-    
+
+    # Handle 'auto' approach
    if approach == 'auto':
-        parts = model.split('-', 1)
-        approach = parts[0]
-        model = parts[1]
-    
+        for known_approach in known_approaches:
+            if model.startswith(f"{known_approach}-"):
+                approach = known_approach
+                model = model[len(known_approach)+1:]
+                break
+        else:
+            # If no known approach is found in the model name, default to 'bon'
+            approach = 'bon'
+
    logger.info(f'Using approach {approach}, with {model}')
-    
+
    try:
        if approach == 'mcts':
            final_response = chat_with_mcts(system_prompt, initial_query, client, model, server_config['mcts_simulations'],
@@ -106,7 +134,7 @@ def proxy():
    except Exception as e:
        logger.error(f"Error processing request: {str(e)}")
        return jsonify({"error": str(e)}), 500
-    
+
    response_data = {
        'model': model,
        'choices': []
@@ -135,41 +163,77 @@ def proxy():
    logger.debug(f'API response: {response_data}')
    return jsonify(response_data), 200

-def main():
+@app.route('/health', methods=['GET'])
+def health():
+    return jsonify({"status": "ok"}), 200
+
+
+def parse_args():
    parser = argparse.ArgumentParser(description="Run LLM inference with various approaches.")
-    parser.add_argument("--approach", type=str, choices=["auto", "mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar",
-                        "cot_reflection", "plansearch", "leap", "agent"], default="auto", help="Inference approach to use")
-    parser.add_argument("--simulations", type=int, default=2, help="Number of MCTS simulations")
-    parser.add_argument("--exploration", type=float, default=0.2, help="Exploration weight for MCTS")
-    parser.add_argument("--depth", type=int, default=1, help="Simulation depth for MCTS")
-    parser.add_argument("--best_of_n", type=int, default=3, help="Number of samples for best_of_n approach")
-    parser.add_argument("--model", type=str, default="gpt-4o-mini", help="OpenAI model to use")
-    parser.add_argument("--base_url", type=str, default="", help="Base url for OpenAI compatible endpoint")
-    parser.add_argument("--rstar-max-depth", type=int, default=3, help="Maximum depth for rStar algorithm")
-    parser.add_argument("--rstar-num-rollouts", type=int, default=5, help="Number of rollouts for rStar algorithm")
-    parser.add_argument("--rstar-c", type=float, default=1.4, help="Exploration constant for rStar algorithm")
-    parser.add_argument("--n", type=int, default=1, help="Number of final responses to be returned")
-    parser.add_argument("--return-full-response", type=bool, default=False, help="Return the full response including the CoT with tags")
-    parser.add_argument("--port", type=int, default=8000, help="Specify the port to run the proxy")
+
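+    # Each option below can also be supplied through the OPTILLM_-prefixed
+    # environment variable listed beside it; the environment value only replaces
+    # the argparse default, so an explicit command-line flag still wins.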
+    # Define arguments and their corresponding environment variables
+    args_env = [
+        ("--api-key", "OPTILLM_API_KEY", str, "", "Optional API key for client authentication to optillm"),
+        ("--approach", "OPTILLM_APPROACH", str, "auto", "Inference approach to use", ["auto"] + known_approaches),
+        ("--simulations", "OPTILLM_SIMULATIONS", int, 2, "Number of MCTS simulations"),
+        ("--exploration", "OPTILLM_EXPLORATION", float, 0.2, "Exploration weight for MCTS"),
+        ("--depth", "OPTILLM_DEPTH", int, 1, "Simulation depth for MCTS"),
+        ("--model", "OPTILLM_MODEL", str, "gpt-4o-mini", "OpenAI model to use"),
+        ("--rstar-max-depth", "OPTILLM_RSTAR_MAX_DEPTH", int, 3, "Maximum depth for rStar algorithm"),
+        ("--rstar-num-rollouts", "OPTILLM_RSTAR_NUM_ROLLOUTS", int, 5, "Number of rollouts for rStar algorithm"),
+        ("--rstar-c", "OPTILLM_RSTAR_C", float, 1.4, "Exploration constant for rStar algorithm"),
+        ("--n", "OPTILLM_N", int, 1, "Number of final responses to be returned"),
+        ("--return-full-response", "OPTILLM_RETURN_FULL_RESPONSE", bool, False, "Return the full response including the CoT with tags"),
+        ("--port", "OPTILLM_PORT", int, 8000, "Specify the port to run the proxy"),
+    ]
+
+    for arg, env, type_, default, help_text, *extra in args_env:
+        env_value = os.environ.get(env)
+        if env_value is not None:
+            if type_ == bool:
+                default = env_value.lower() in ('true', '1', 'yes')
+            else:
+                default = type_(env_value)
+        if type_ == bool:
+            # type=bool would treat any non-empty string (even "False") as True,
+            # so boolean options are exposed as store_true flags instead
+            parser.add_argument(arg, action='store_true', default=default, help=help_text)
+        elif extra and extra[0]:  # Check if there are choices for this argument
+            parser.add_argument(arg, type=type_, default=default, help=help_text, choices=extra[0])
+        else:
+            parser.add_argument(arg, type=type_, default=default, help=help_text)
+
+    # Special handling for best_of_n to support both formats
+    best_of_n_default = int(os.environ.get("OPTILLM_BEST_OF_N", 3))
+    parser.add_argument("--best-of-n", "--best_of_n", dest="best_of_n", type=int, default=best_of_n_default,
+                        help="Number of samples for best_of_n approach")
+
+    # Special handling for base_url to support both formats
+    base_url_default = os.environ.get("OPTILLM_BASE_URL", "")
+    parser.add_argument("--base-url", "--base_url", dest="base_url", type=str, default=base_url_default,
+                        help="Base url for OpenAI compatible endpoint")
+
    args = parser.parse_args()
-    
-    server_config.update({
-        'model': args.model,
-        'approach': args.approach,
-        'mcts_simulations': args.simulations,
-        'mcts_exploration': args.exploration,
-        'mcts_depth': args.depth,
-        'best_of_n': args.best_of_n,
-        'rstar_max_depth': args.rstar_max_depth,
-        'rstar_num_rollouts': args.rstar_num_rollouts,
-        'rstar_c': args.rstar_c,
-        'base_url' : args.base_url,
-        'return_full_response': args.return_full_response,
-        'n': args.n,
-    })
-    port = args.port
-    logger.info(f"Starting server with approach: {args.approach}")
-    logger.info(f"Server configuration: {server_config}")
+
+    # argparse already stores dests with underscores; remap the MCTS flags onto
+    # the prefixed keys that proxy() reads from server_config
+    args_dict = vars(args)
+    args_dict['mcts_simulations'] = args_dict.pop('simulations')
+    args_dict['mcts_exploration'] = args_dict.pop('exploration')
+    args_dict['mcts_depth'] = args_dict.pop('depth')
+
+    return args
+
+
+def main():
+    global server_config
+    args = parse_args()
+
+    # Update server_config with all argument values
+    server_config.update(vars(args))
+
+    port = server_config['port']
+    logger.info(f"Starting server with approach: {server_config['approach']}")
+    server_config_clean = server_config.copy()
+    if server_config_clean['api_key']:
+        server_config_clean['api_key'] = '[REDACTED]'
+    logger.info(f"Server configuration: {server_config_clean}")
    app.run(host='0.0.0.0', port=port)

if __name__ == "__main__":
    main()