From a8bc86210eec83b03dda31b74e15ddd02c1e2978 Mon Sep 17 00:00:00 2001 From: Sam Date: Sun, 15 Sep 2024 08:34:26 +1000 Subject: [PATCH] feat(docker,auth): Add Docker, Compose and optional auth --- Dockerfile | 47 ++++++++++++++ README.md | 119 +++++++++++++++++++++++++++-------- docker-compose.yaml | 38 +++++++++++ optillm.py | 150 +++++++++++++++++++++++++++++++------------- 4 files changed, 285 insertions(+), 69 deletions(-) create mode 100644 Dockerfile create mode 100644 docker-compose.yaml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5b22f47 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,47 @@ +# Build stage +FROM python:3.12-slim AS builder + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libc6-dev \ + && rm -rf /var/lib/apt/lists/* + +# Copy only the requirements file first to leverage Docker cache +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Final stage +FROM python:3.12-slim + +# Install curl for the healthcheck +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy installed dependencies from builder stage +COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin + +# Copy application code +COPY . . + +# Create a non-root user and switch to it +RUN useradd -m appuser +USER appuser + +# Set environment variables +ENV PYTHONUNBUFFERED=1 + +# Expose the port the app runs on +EXPOSE 8000 + +# Run the application +ENTRYPOINT ["python", "optillm.py"] diff --git a/README.md b/README.md index f20de50..c03459d 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,16 @@ optillm is an OpenAI API compatible optimizing inference proxy which implements ### plansearch-gpt-4o-mini on LiveCodeBench (Sep 2024) -| Model | pass@1 | pass@5 | pass@10 | -|-------|--------|--------|---------| -| plansearch-gpt-4o-mini | 44.03 | 59.31 | 63.5 | -| gpt-4o-mini | 43.9 | 50.61 | 53.25 | -| claude-3.5-sonnet | 51.3 | | | -| gpt-4o-2024-05-13 | 45.2 | | | -| gpt-4-turbo-2024-04-09 | 44.2 | | | +| Model | pass@1 | pass@5 | pass@10 | +| ---------------------- | ------ | ------ | ------- | +| plansearch-gpt-4o-mini | 44.03 | 59.31 | 63.5 | +| gpt-4o-mini | 43.9 | 50.61 | 53.25 | +| claude-3.5-sonnet | 51.3 | | | +| gpt-4o-2024-05-13 | 45.2 | | | +| gpt-4-turbo-2024-04-09 | 44.2 | | | ### moa-gpt-4o-mini on Arena-Hard-Auto (Aug 2024) + ![Results showing Mixture of Agents approach using gpt-4o-mini on Arena Hard Auto Benchmark](./moa-results.png) ## Installation @@ -32,7 +33,7 @@ pip install -r requirements.txt You can then run the optillm proxy as follows. ```bash -python optillm.py +python optillm.py 2024-09-06 07:57:14,191 - INFO - Starting server with approach: auto 2024-09-06 07:57:14,191 - INFO - Server configuration: {'approach': 'auto', 'mcts_simulations': 2, 'mcts_exploration': 0.2, 'mcts_depth': 1, 'best_of_n': 3, 'model': 'gpt-4o-mini', 'rstar_max_depth': 3, 'rstar_num_rollouts': 5, 'rstar_c': 1.4, 'base_url': ''} * Serving Flask app 'optillm' @@ -44,11 +45,11 @@ python optillm.py 2024-09-06 07:57:14,212 - INFO - Press CTRL+C to quit ``` -### Usage +## Usage -Once the proxy is running, you can just use it as a drop in replacement for an OpenAI client by setting the `base_url` as `http://localhost:8000/v1`. 
+Once the proxy is running, you can use it as a drop-in replacement for an OpenAI client by setting the `base_url` to `http://localhost:8000/v1`.
 
-```bash
+```python
import os
from openai import OpenAI

@@ -70,7 +71,7 @@ response = client.chat.completions.create(
print(response)
```

-You can control the technique you use for optimization by prepending the slug to the model name `{slug}-model-name`. E.g. in the above code we are using `moa` or
-mixture of agents as the optimization approach. In the proxy logs you will see the following showing the `moa` is been used with the base model as `gpt-4o-mini`.
+You can control the technique used for optimization by prepending its slug to the model name as `{slug}-model-name`. E.g. in the above code we are using `moa`, or
+mixture of agents, as the optimization approach. In the proxy logs you will see the following, showing that `moa` is being used with the base model `gpt-4o-mini`.

```bash
@@ -83,20 +84,86 @@ mixture of agents as the optimization approach. In the proxy logs you will see t

## Implemented techniques

-| Technique | Slug | Description |
-|-----------|----------------|-------------|
-| Agent | `agent ` | Determines which of the below approaches to take and then combines the results |
-| Monte Carlo Tree Search | `mcts` | Uses MCTS for decision-making in chat responses |
-| Best of N Sampling | `bon` | Generates multiple responses and selects the best one |
-| Mixture of Agents | `moa` | Combines responses from multiple critiques |
-| Round Trip Optimization | `rto` | Optimizes responses through a round-trip process |
-| Z3 Solver | `z3` | Utilizes the Z3 theorem prover for logical reasoning |
-| Self-Consistency | `self_consistency` | Implements an advanced self-consistency method |
-| PV Game | `pvg` | Applies a prover-verifier game approach at inference time |
-| R* Algorithm | `rstar` | Implements the R* algorithm for problem-solving |
-| CoT with Reflection | `cot_reflection` | Implements chain-of-thought reasoning with \<thinking\>, \<reflection\> and \<output\> sections |
-| PlanSearch | `plansearch` | Implements a search algorithm over candidate plans for solving a problem in natural language |
-| LEAP | `leap` | Learns task-specific principles from few shot examples |
+| Technique               | Slug               | Description                                                                                      |
+| ----------------------- | ------------------ | ------------------------------------------------------------------------------------------------ |
+| Agent                   | `agent`            | Determines which of the below approaches to take and then combines the results                  |
+| Monte Carlo Tree Search | `mcts`             | Uses MCTS for decision-making in chat responses                                                 |
+| Best of N Sampling      | `bon`              | Generates multiple responses and selects the best one                                          |
+| Mixture of Agents       | `moa`              | Combines responses from multiple critiques                                                      |
+| Round Trip Optimization | `rto`              | Optimizes responses through a round-trip process                                                |
+| Z3 Solver               | `z3`               | Utilizes the Z3 theorem prover for logical reasoning                                            |
+| Self-Consistency        | `self_consistency` | Implements an advanced self-consistency method                                                  |
+| PV Game                 | `pvg`              | Applies a prover-verifier game approach at inference time                                       |
+| R* Algorithm            | `rstar`            | Implements the R* algorithm for problem-solving                                                 |
+| CoT with Reflection     | `cot_reflection`   | Implements chain-of-thought reasoning with \<thinking\>, \<reflection\> and \<output\> sections |
+| PlanSearch              | `plansearch`       | Implements a search algorithm over candidate plans for solving a problem in natural language    |
+| LEAP                    | `leap`             | Learns task-specific principles from few-shot examples                                          |
+
+## Available Parameters
+
+optillm supports various command-line arguments and environment variables for configuration.
+
+| Parameter                | Description                                            | Default Value   |
+|--------------------------|--------------------------------------------------------|-----------------|
+| `--approach`             | Inference approach to use                              | `"auto"`        |
+| `--simulations`          | Number of MCTS simulations                             | 2               |
+| `--exploration`          | Exploration weight for MCTS                            | 0.2             |
+| `--depth`                | Simulation depth for MCTS                              | 1               |
+| `--best-of-n`            | Number of samples for best_of_n approach               | 3               |
+| `--model`                | OpenAI model to use                                    | `"gpt-4o-mini"` |
+| `--base-url`             | Base URL for OpenAI compatible endpoint                | `""`            |
+| `--rstar-max-depth`      | Maximum depth for rStar algorithm                      | 3               |
+| `--rstar-num-rollouts`   | Number of rollouts for rStar algorithm                 | 5               |
+| `--rstar-c`              | Exploration constant for rStar algorithm               | 1.4             |
+| `--n`                    | Number of final responses to be returned               | 1               |
+| `--return-full-response` | Return the full response including the CoT with tags   | `False`         |
+| `--port`                 | Specify the port to run the proxy                      | 8000            |
+| `--api-key`              | Optional API key for client authentication to optillm  | `""`            |
+
+When using Docker, these can be set as environment variables prefixed with `OPTILLM_`.
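+
+For example, the two invocations below are equivalent: an `OPTILLM_`-prefixed environment variable only seeds the default for the matching argument, so an explicit command-line flag still takes precedence. (The values shown are illustrative.)
+
+```bash
+# Set the approach and the best_of_n sample count via flags...
+python optillm.py --approach moa --best-of-n 5
+
+# ...or equivalently via environment variables
+OPTILLM_APPROACH=moa OPTILLM_BEST_OF_N=5 python optillm.py
+```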
+
+## Running with Docker
+
+optillm can optionally be built and run using Docker and the provided [Dockerfile](./Dockerfile).
+
+### Using Docker Compose
+
+1. Make sure you have Docker and Docker Compose installed on your system.
+
+2. Either update the environment variables in the `docker-compose.yaml` file or create a `.env` file in the project root directory and add any environment variables you want to set. For example, to set the OpenAI API key, add the following line to the `.env` file:
+
+   ```bash
+   OPENAI_API_KEY=your_openai_api_key_here
+   ```
+
+3. Run the following command to start optillm:
+
+   ```bash
+   docker compose up -d
+   ```
+
+   This will build the Docker image if it doesn't exist and start the optillm service.
+
+4. optillm will be available at `http://localhost:8000`.
+
+When using Docker, you can set these parameters as environment variables. For example, to set the approach and model, you would use:
+
+```bash
+OPTILLM_APPROACH=mcts
+OPTILLM_MODEL=gpt-4
+```
+
+To secure the optillm proxy with an API key, set the `OPTILLM_API_KEY` environment variable:
+
+```bash
+OPTILLM_API_KEY=your_secret_api_key
+```
+
+When the API key is set, clients must include it in their requests using the `Authorization` header:
+
+```plain
+Authorization: Bearer your_secret_api_key
+```
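+
+For example, an authenticated request to the proxy might look like this (`your_secret_api_key` and the prompt are placeholders):
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer your_secret_api_key" \
+  -d '{"model": "moa-gpt-4o-mini", "messages": [{"role": "user", "content": "Hello"}]}'
+```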

## References

diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..8e983ba
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,38 @@
+services:
+  &name optillm:
+    build:
+      context: https://github.com/codelion/optillm.git#main
+      # context: .
+      dockerfile: Dockerfile
+      tags:
+        - optillm:latest
+    image: optillm:latest
+    container_name: *name
+    hostname: *name
+    ports:
+      - "8000:8000"
+    environment:
+      # note: compose takes ${VAR:-default} defaults literally, so the default
+      # must not be quoted or the quotes end up in the value
+      OPENAI_API_KEY: ${OPENAI_API_KEY:-}
+      OPTILLM_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
+      # OPTILLM_API_KEY: ${OPTILLM_API_KEY:-} # optionally sets an API key for Optillm clients
+      # Uncomment and set values for other arguments (prefixed with OPTILLM_) as needed, e.g.:
+      # OPTILLM_APPROACH: auto
+      # OPTILLM_MODEL: gpt-4o-mini
+      # OPTILLM_SIMULATIONS: 2
+      # OPTILLM_EXPLORATION: 0.2
+      # OPTILLM_DEPTH: 1
+      # OPTILLM_BEST_OF_N: 3
+      # OPTILLM_RSTAR_MAX_DEPTH: 3
+      # OPTILLM_RSTAR_NUM_ROLLOUTS: 5
+      # OPTILLM_RSTAR_C: 1.4
+      # OPTILLM_N: 1
+      # OPTILLM_RETURN_FULL_RESPONSE: false
+      # OPTILLM_PORT: 8000
+    restart: on-failure
+    stop_grace_period: 2s
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://127.0.0.1:8000/health"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 3s
diff --git a/optillm.py b/optillm.py
index 6fcd716..f6d8134 100644
--- a/optillm.py
+++ b/optillm.py
@@ -1,6 +1,7 @@
import argparse
import logging
import os
+import secrets

from flask import Flask, request, jsonify
from openai import OpenAI
@@ -41,21 +42,41 @@
    'rstar_num_rollouts': 5,
    'rstar_c': 1.4,
    'n': 1,
+    'base_url': '',
+    'api_key': '',
+    'return_full_response': False,
+    'port': 8000,
}

+# List of known approaches
+known_approaches = ["mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar",
+                    "cot_reflection", "plansearch", "leap", "agent"]
+
+# Optional API key configuration to secure the proxy
+@app.before_request
+def check_api_key():
+    # Exempt the health endpoint so the Docker healthcheck works without credentials
+    if request.path == '/health':
+        return None
+    if server_config['api_key']:
+        auth_header = request.headers.get('Authorization')
+        if not auth_header or not auth_header.startswith('Bearer '):
+            return jsonify({"error": "Invalid Authorization header. Expected format: 'Authorization: Bearer YOUR_API_KEY'"}), 401
+
+        client_key = auth_header.split('Bearer ', 1)[1].strip()
+        if not secrets.compare_digest(client_key, server_config['api_key']):
+            return jsonify({"error": "Invalid API key"}), 401
+
@app.route('/v1/chat/completions', methods=['POST'])
def proxy():
    logger.info('Received request to /v1/chat/completions')
    data = request.get_json()
    logger.debug(f'Request data: {data}')
-    
+
    messages = data.get('messages', [])
    model = data.get('model', server_config['model'])
    n = data.get('n', server_config['n'])
-    
+
    system_prompt = next((msg['content'] for msg in messages if msg['role'] == 'system'), "")
    initial_query = next((msg['content'] for msg in messages if msg['role'] == 'user'), "")
-    
+
    approach = server_config['approach']
    base_url = server_config['base_url']

@@ -63,14 +84,21 @@ def proxy():
        client = OpenAI(api_key=API_KEY, base_url=base_url)
    else:
        client = default_client
-    
+
+    # Handle 'auto' approach
    if approach == 'auto':
-        parts = model.split('-', 1)
-        approach = parts[0]
-        model = parts[1]
-    
+        for known_approach in known_approaches:
+            if model.startswith(f"{known_approach}-"):
+                approach = known_approach
+                model = model[len(known_approach)+1:]
+                break
+        else:
+            # If no known approach is found in the model name, default to 'bon'
+            approach = 'bon'
+
    logger.info(f'Using approach {approach}, with {model}')
-    
+
    try:
        if approach == 'mcts':
            final_response = chat_with_mcts(system_prompt, initial_query, client, model, server_config['mcts_simulations'],
@@ -106,7 +134,7 @@ def proxy():
    except Exception as e:
        logger.error(f"Error processing request: {str(e)}")
        return jsonify({"error": str(e)}), 500
-    
+
    response_data = {
        'model': model,
        'choices': []
@@ -135,41 +163,77 @@ def proxy():
    logger.debug(f'API response: {response_data}')
    return jsonify(response_data), 200

-def main():
+@app.route('/health', methods=['GET'])
+def health():
+    return jsonify({"status": "ok"}), 200
+
+
+def parse_args():
    parser = argparse.ArgumentParser(description="Run LLM inference with various approaches.")
-    parser.add_argument("--approach", type=str, choices=["auto", "mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar",
-                        "cot_reflection", "plansearch", "leap", "agent"], default="auto", help="Inference approach to use")
-    parser.add_argument("--simulations", type=int, default=2, help="Number of MCTS simulations")
-    parser.add_argument("--exploration", type=float, default=0.2, help="Exploration weight for MCTS")
-    parser.add_argument("--depth", type=int, default=1, help="Simulation depth for MCTS")
-    parser.add_argument("--best_of_n", type=int, default=3, help="Number of samples for best_of_n approach")
-    parser.add_argument("--model", type=str, default="gpt-4o-mini", help="OpenAI model to use")
-    parser.add_argument("--base_url", type=str, default="", help="Base url for OpenAI compatible endpoint")
-    parser.add_argument("--rstar-max-depth", type=int, default=3, help="Maximum depth for rStar algorithm")
-    parser.add_argument("--rstar-num-rollouts", type=int, default=5, help="Number of rollouts for rStar algorithm")
-    parser.add_argument("--rstar-c", type=float, default=1.4, help="Exploration constant for rStar algorithm")
-    parser.add_argument("--n", type=int, default=1, help="Number of final responses to be returned")
-    parser.add_argument("--return-full-response", type=bool, default=False, help="Return the full response including the CoT with tags")
-    parser.add_argument("--port", type=int, default=8000, help="Specify the port to run the proxy")
+
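+    # Each option below can also be supplied through the OPTILLM_-prefixed
+    # environment variable listed beside it; the environment value only replaces
+    # the argparse default, so an explicit command-line flag still wins.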
+    # Define arguments and their corresponding environment variables
+    args_env = [
+        ("--api-key", "OPTILLM_API_KEY", str, "", "Optional API key for client authentication to optillm"),
+        ("--approach", "OPTILLM_APPROACH", str, "auto", "Inference approach to use", ["auto"] + known_approaches),
+        ("--simulations", "OPTILLM_SIMULATIONS", int, 2, "Number of MCTS simulations"),
+        ("--exploration", "OPTILLM_EXPLORATION", float, 0.2, "Exploration weight for MCTS"),
+        ("--depth", "OPTILLM_DEPTH", int, 1, "Simulation depth for MCTS"),
+        ("--model", "OPTILLM_MODEL", str, "gpt-4o-mini", "OpenAI model to use"),
+        ("--rstar-max-depth", "OPTILLM_RSTAR_MAX_DEPTH", int, 3, "Maximum depth for rStar algorithm"),
+        ("--rstar-num-rollouts", "OPTILLM_RSTAR_NUM_ROLLOUTS", int, 5, "Number of rollouts for rStar algorithm"),
+        ("--rstar-c", "OPTILLM_RSTAR_C", float, 1.4, "Exploration constant for rStar algorithm"),
+        ("--n", "OPTILLM_N", int, 1, "Number of final responses to be returned"),
+        ("--return-full-response", "OPTILLM_RETURN_FULL_RESPONSE", bool, False, "Return the full response including the CoT with tags"),
+        ("--port", "OPTILLM_PORT", int, 8000, "Specify the port to run the proxy"),
+    ]
+
+    for arg, env, type_, default, help_text, *extra in args_env:
+        env_value = os.environ.get(env)
+        if env_value is not None:
+            if type_ == bool:
+                default = env_value.lower() in ('true', '1', 'yes')
+            else:
+                default = type_(env_value)
+        if type_ == bool:
+            # type=bool would treat any non-empty string (even "False") as True,
+            # so boolean options are exposed as store_true flags instead
+            parser.add_argument(arg, action='store_true', default=default, help=help_text)
+        elif extra and extra[0]:  # Check if there are choices for this argument
+            parser.add_argument(arg, type=type_, default=default, help=help_text, choices=extra[0])
+        else:
+            parser.add_argument(arg, type=type_, default=default, help=help_text)
+
+    # Special handling for best_of_n to support both formats
+    best_of_n_default = int(os.environ.get("OPTILLM_BEST_OF_N", 3))
+    parser.add_argument("--best-of-n", "--best_of_n", dest="best_of_n", type=int, default=best_of_n_default,
+                        help="Number of samples for best_of_n approach")
+
+    # Special handling for base_url to support both formats
+    base_url_default = os.environ.get("OPTILLM_BASE_URL", "")
+    parser.add_argument("--base-url", "--base_url", dest="base_url", type=str, default=base_url_default,
+                        help="Base url for OpenAI compatible endpoint")
+
    args = parser.parse_args()
-    
-    server_config.update({
-        'model': args.model,
-        'approach': args.approach,
-        'mcts_simulations': args.simulations,
-        'mcts_exploration': args.exploration,
-        'mcts_depth': args.depth,
-        'best_of_n': args.best_of_n,
-        'rstar_max_depth': args.rstar_max_depth,
-        'rstar_num_rollouts': args.rstar_num_rollouts,
-        'rstar_c': args.rstar_c,
-        'base_url' : args.base_url,
-        'return_full_response': args.return_full_response,
-        'n': args.n,
-    })
-    port = args.port
-    logger.info(f"Starting server with approach: {args.approach}")
-    logger.info(f"Server configuration: {server_config}")
+
+    # argparse already stores dests with underscores; remap the MCTS flags onto
+    # the prefixed keys that proxy() reads from server_config
+    args_dict = vars(args)
+    args_dict['mcts_simulations'] = args_dict.pop('simulations')
+    args_dict['mcts_exploration'] = args_dict.pop('exploration')
+    args_dict['mcts_depth'] = args_dict.pop('depth')
+
+    return args
+
+
+def main():
+    global server_config
+    args = parse_args()
+
+    # Update server_config with all argument values
+    server_config.update(vars(args))
+
+    port = server_config['port']
+    logger.info(f"Starting server with approach: {server_config['approach']}")
+    server_config_clean = server_config.copy()
+    if server_config_clean['api_key']:
+        server_config_clean['api_key'] = '[REDACTED]'
+    logger.info(f"Server configuration: {server_config_clean}")
    app.run(host='0.0.0.0', port=port)

if __name__ == "__main__":
    main()