OPIK-665: Add Python code executor PoC Docker sandbox
andrescrz committed Jan 16, 2025
1 parent 720a0c3 commit 4071b8c
Showing 11 changed files with 228 additions and 146 deletions.
22 changes: 22 additions & 0 deletions apps/opik-python-backend/Dockerfile
@@ -0,0 +1,22 @@
FROM docker:latest

ENV DOCKER_HOST=unix:///var/run/docker.sock

RUN apk update && apk upgrade \
&& apk add --no-cache \
python3 python3-dev py3-pip \
libffi-dev openssl-dev build-base git curl bash \
cargo gcc musl-dev

WORKDIR /opt/opik-python-backend

COPY requirements.txt .
RUN pip install -r requirements.txt --break-system-packages

COPY src ./src

EXPOSE 8000

CMD dockerd-entrypoint.sh & \
sleep 5 \
&& gunicorn --workers 4 --bind=0.0.0.0:8000 --chdir ./src 'opik_backend:create_app()'
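
The sketch below is a hypothetical local smoke test for this image, not part of the commit: the image tag, build-context path, and the `--privileged` flag are assumptions (the `dockerd-entrypoint.sh` entrypoint starts an inner Docker daemon, which usually needs elevated privileges).

```python
# Hypothetical local smoke test for the sandbox service image.
# The tag, context path and --privileged flag are illustrative assumptions.
import subprocess

# Build the service image from the backend directory (assumed build context).
subprocess.run(
    ["docker", "build", "-t", "opik-python-backend:local", "apps/opik-python-backend"],
    check=True,
)

# dockerd-entrypoint.sh starts an inner Docker daemon, which usually requires
# --privileged; port 8000 matches the EXPOSE directive above.
subprocess.run(
    ["docker", "run", "--rm", "--privileged", "-p", "8000:8000", "opik-python-backend:local"],
    check=True,
)
```
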
4 changes: 2 additions & 2 deletions apps/opik-python-backend/README.md
@@ -2,15 +2,15 @@

## Requirements

- Install Python: at least the minimum version compatible with the Opik Python SDK.
- Install Python.
- Create and enable a Python virtual environment.
- Install all dependencies from `requirements.txt`.
- For running tests, also install dependencies from `tests/test_requirements.txt`.

## Running the Flask service

> [!TIP]
> Run in debug mode for development purposes, it reloads the code automatically.
> Run it in debug mode for development purposes; it reloads the code automatically.
- From `apps/opik-python-backend` directory.
- Run the `opik_backend` module.
58 changes: 6 additions & 52 deletions apps/opik-python-backend/requirements.txt
@@ -1,61 +1,15 @@
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.7.0
attrs==24.3.0
blinker==1.9.0
certifi==2024.12.14
charset-normalizer==3.4.0
click==8.1.7
distro==1.9.0
filelock==3.16.1
charset-normalizer==3.4.1
click==8.1.8
docker==7.1.0
Flask==3.1.0
frozenlist==1.5.0
fsspec==2024.12.0
h11==0.14.0
httpcore==1.0.7
httpx==0.27.2
huggingface-hub==0.27.0
gunicorn==23.0.0
idna==3.10
importlib_metadata==8.5.0
iniconfig==2.0.0
itsdangerous==2.2.0
Jinja2==3.1.4
jiter==0.8.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
Levenshtein==0.26.1
litellm==1.55.7
markdown-it-py==3.0.0
Jinja2==3.1.5
MarkupSafe==3.0.2
mdurl==0.1.2
multidict==6.1.0
openai==1.58.1
opik==1.3.0
packaging==24.2
pluggy==1.5.0
propcache==0.2.1
pydantic==2.10.4
pydantic-settings==2.7.0
pydantic_core==2.27.2
Pygments==2.18.0
python-dotenv==1.0.1
PyYAML==6.0.2
RapidFuzz==3.11.0
referencing==0.35.1
regex==2024.11.6
requests==2.32.3
rich==13.9.4
rpds-py==0.22.3
sniffio==1.3.1
tenacity==9.0.0
tiktoken==0.8.0
tokenizers==0.21.0
tqdm==4.67.1
typing_extensions==4.12.2
urllib3==2.2.3
uuid7==0.1.0
urllib3==2.3.0
Werkzeug==3.1.3
yarl==1.18.3
zipp==3.21.0
7 changes: 7 additions & 0 deletions apps/opik-python-backend/src/opik_backend/__init__.py
@@ -14,4 +14,11 @@ def create_app(test_config=None):
    from opik_backend.evaluator import evaluator
    app.register_blueprint(evaluator)

    # TODO: optimize creation e.g: at service build time
    from opik_backend.docker_runner import \
        create_docker_image, \
        PYTHON_CODE_EXECUTOR_DOCKERFILE, \
        PYTHON_CODE_EXECUTOR_IMAGE_NAME_AND_TAG
    create_docker_image(PYTHON_CODE_EXECUTOR_DOCKERFILE, PYTHON_CODE_EXECUTOR_IMAGE_NAME_AND_TAG)

    return app
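
For local development, the factory could also be run with Flask's built-in server, as the README's debug-mode tip suggests. This is only a sketch: it assumes a reachable Docker daemon so the sandbox image build in `create_app()` succeeds, and the host/port values are illustrative.

```python
# Illustrative local run of the app factory; assumes a reachable Docker daemon
# so create_app() can build the sandbox image at startup.
from opik_backend import create_app

app = create_app()
# Debug mode reloads code on change, as suggested in the README.
app.run(host="0.0.0.0", port=8000, debug=True)
```
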
63 changes: 63 additions & 0 deletions apps/opik-python-backend/src/opik_backend/docker_runner.py
@@ -0,0 +1,63 @@
import io
import json
import logging

import docker

from opik_backend.scoring_commands import PYTHON_SCORING_COMMAND

logger = logging.getLogger(__name__)

PYTHON_CODE_EXECUTOR_IMAGE_NAME_AND_TAG = "opik-executor-sandbox-python:latest"

# TODO: Optimise Dockerfile definition e.g: use physical file
PYTHON_CODE_EXECUTOR_DOCKERFILE = """
FROM python:3.12.3-slim
RUN pip install opik
"""


def create_docker_image(dockerfile_string, image_name):
    client = docker.from_env()
    try:
        _, logs = client.images.build(
            fileobj=io.BytesIO(dockerfile_string.encode('utf-8')),
            tag=image_name
        )
        for log in logs:
            logger.info(log.get('stream', '').strip())
        logger.info(f"Image '{image_name}' created successfully.")
    except Exception as e:
        logger.error(f"Error building image '{image_name}': {e}")
        raise e


def run_scoring_in_docker_python_container(code, data):
    client = docker.from_env()
    try:
        # TODO: Optimise run latency e.g: pre-allocating containers
        container = client.containers.run(
            image=PYTHON_CODE_EXECUTOR_IMAGE_NAME_AND_TAG,
            command=["python", "-c", PYTHON_SCORING_COMMAND, code, json.dumps(data)],
            mem_limit="128mb",
            cpu_shares=2,
            detach=True,
            network_disabled=True,
            security_opt=["no-new-privileges"],
        )
        try:
            result = container.wait(timeout=3)
            logs = container.logs().decode("utf-8")
            status_code = result["StatusCode"]
            if status_code == 0:
                last_line = logs.strip().splitlines()[-1]
                # TODO: Validate JSON response e.g: schema validation
                return json.loads(last_line)
            else:
                logger.warning(f"Execution failed (Code: {status_code}):\n{logs}")
                return {"code": 400, "error": "Execution failed: Python code contains an invalid metric"}
        finally:
            container.remove()
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}")
        return {"code": 500, "error": "An unexpected error occurred"}
66 changes: 9 additions & 57 deletions apps/opik-python-backend/src/opik_backend/evaluator.py
@@ -1,52 +1,20 @@
import inspect
from types import ModuleType
from typing import Type, Union, List, Any, Dict
from typing import Any, Dict

from flask import request, abort, jsonify, Blueprint, current_app
from opik.evaluation.metrics import BaseMetric
from opik.evaluation.metrics.score_result import ScoreResult
from werkzeug.exceptions import HTTPException

from .helpers.id_helpers import uuid4_str
from opik_backend.docker_runner import run_scoring_in_docker_python_container

evaluator = Blueprint('evaluator', __name__, url_prefix='/v1/private/evaluators')


def get_module(code: str, module_name: str = uuid4_str()) -> ModuleType:
    module: ModuleType = ModuleType(module_name)
    exec(code, module.__dict__)
    return module


def get_metric_class(module: ModuleType) -> Type[BaseMetric]:
    for _, cls in inspect.getmembers(module, inspect.isclass):
        if issubclass(cls, BaseMetric):
            return cls


def evaluate_metric(metric_class: Type[BaseMetric], data: Dict[Any, Any]) -> Union[ScoreResult, List[ScoreResult]]:
    base_metric: BaseMetric = metric_class()
    return base_metric.score(**data)


def to_scores(score_result: Union[ScoreResult, List[ScoreResult]]) -> List[ScoreResult]:
    scores: List[ScoreResult] = []
    if isinstance(score_result, ScoreResult):
        scores = [score_result]
    elif isinstance(score_result, list):
        for item in score_result:
            if isinstance(item, ScoreResult):
                scores.append(item)
    return scores


@evaluator.errorhandler(400)
def bad_request(exception: HTTPException):
    return jsonify(error=str(exception)), 400


@evaluator.route("", methods=["POST"])
def execute_evaluator():
@evaluator.route("/python", methods=["POST"])
def execute_evaluator_python():
    if request.method != "POST":
        return

@@ -60,29 +28,13 @@ def execute_evaluator():
    if data is None:
        abort(400, "Field 'data' is missing in the request")

    try:
        module: ModuleType = get_module(code)
        metric_class: Type[BaseMetric] = get_metric_class(module)
    except Exception as exception:
        current_app.logger.info("Exception getting metric class, message '%s', code '%s'", exception, code)
        abort(400, "Field 'code' contains invalid Python code")

    if metric_class is None:
        current_app.logger.info("Missing BaseMetric in code '%s'", code)
        abort(400,
              "Field 'code' in the request doesn't contain a subclass implementation of 'opik.evaluation.metrics.BaseMetric'")

    score_result: List[ScoreResult] = []
    try:
        score_result = evaluate_metric(metric_class, data)
    except Exception as exception:
        current_app.logger.info("Exception evaluating metric, message '%s', data '%s', code '%s'",
                                exception, data, code)
        abort(400, "The provided 'code' and 'data' fields can't be evaluated")
    response = run_scoring_in_docker_python_container(code, data)
    if "error" in response:
        abort(response["code"], response["error"])

    scores: List[ScoreResult] = to_scores(score_result)
    scores = response.get("scores", [])
    if len(scores) == 0:
        current_app.logger.info("Missing ScoreResult in code '%s'", code)
        abort(400, "The provided 'code' field didn't return any 'opik.evaluation.metrics.ScoreResult'")

    return jsonify({"scores": scores})
    return jsonify(response)
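
Over HTTP, the new route takes a POST with the `code` and `data` fields read by the handler. A minimal sketch, assuming the service listens on localhost:8000 (the port exposed by the Dockerfile) and that `code` defines an `opik` `BaseMetric` subclass:

```python
# Hedged sketch of calling the new endpoint; host, port and payload values
# are assumptions for illustration.
import requests

payload = {
    "code": "...Python source defining an opik BaseMetric subclass...",
    "data": {"output": "hello", "reference": "hello"},
}
resp = requests.post(
    "http://localhost:8000/v1/private/evaluators/python",
    json=payload,
)
print(resp.status_code, resp.json())  # 200 with {"scores": [...]} on success
```
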
Empty file.

This file was deleted.

52 changes: 52 additions & 0 deletions apps/opik-python-backend/src/opik_backend/scoring_commands.py
@@ -0,0 +1,52 @@
PYTHON_SCORING_COMMAND = """
import inspect
import json
import uuid
from sys import argv
from types import ModuleType
from typing import Type, Union, List, Any, Dict
from opik.evaluation.metrics import BaseMetric
from opik.evaluation.metrics.score_result import ScoreResult
def get_module(code: str) -> ModuleType:
    module_name = str(uuid.uuid4())
    module = ModuleType(module_name)
    exec(code, module.__dict__)
    return module
def get_metric_class(module: ModuleType) -> Type[BaseMetric]:
    for _, cls in inspect.getmembers(module, inspect.isclass):
        if issubclass(cls, BaseMetric):
            return cls
def evaluate_metric(metric_class: Type[BaseMetric], data: Dict[Any, Any]) -> Union[ScoreResult, List[ScoreResult]]:
    metric = metric_class()
    return metric.score(**data)
def to_scores(score_result: Union[ScoreResult, List[ScoreResult]]) -> List[ScoreResult]:
    scores = []
    if isinstance(score_result, ScoreResult):
        scores = [score_result]
    elif isinstance(score_result, list):
        for item in score_result:
            if isinstance(item, ScoreResult):
                scores.append(item)
    return scores
code = argv[1]
data = json.loads(argv[2])
module = get_module(code)
metric_class = get_metric_class(module)
score_result = evaluate_metric(metric_class, data)
scores = to_scores(score_result)
response = json.dumps({"scores": [score.__dict__ for score in scores]})
print(response)
"""