diff --git a/apps/opik-python-backend/Dockerfile b/apps/opik-python-backend/Dockerfile
new file mode 100644
index 0000000000..bffe6cb501
--- /dev/null
+++ b/apps/opik-python-backend/Dockerfile
@@ -0,0 +1,24 @@
+# Docker-in-Docker base image: the service builds and runs sandbox containers itself
+FROM docker:latest
+
+ENV DOCKER_HOST=unix:///var/run/docker.sock
+
+RUN apk update && apk upgrade \
+    && apk add --no-cache \
+    python3 python3-dev py3-pip \
+    libffi-dev openssl-dev build-base git curl bash \
+    cargo gcc musl-dev
+
+WORKDIR /opt/opik-python-backend
+
+COPY requirements.txt .
+RUN pip install -r requirements.txt --break-system-packages
+
+COPY src ./src
+
+EXPOSE 8000
+
+# Give the Docker daemon a moment to come up before starting the app server
+CMD dockerd-entrypoint.sh & \
+    sleep 5 \
+    && gunicorn --workers 4 --bind=0.0.0.0:8000 --chdir ./src 'opik_backend:create_app()'
diff --git a/apps/opik-python-backend/README.md b/apps/opik-python-backend/README.md
index 9a43dbe366..3804b74822 100644
--- a/apps/opik-python-backend/README.md
+++ b/apps/opik-python-backend/README.md
@@ -2,7 +2,7 @@
 
 ## Requirements
 
-- Install Python: at least the minimum version compatible with the Opik Python SDK.
+- Install Python.
 - Create and enable a Python virtual environment.
 - Install all dependencies from `requirements.txt`.
 - For running tests, also install dependencies from `tests/test_requirements.txt`.
@@ -10,7 +10,7 @@
 ## Running the Flask service
 
 > [!TIP]
-> Run in debug mode for development purposes, it reloads the code automatically.
+> Run it in debug mode for development purposes; it reloads the code automatically.
 
 - From `apps/opik-python-backend` directory.
 - Run the `opik_backend` module.
diff --git a/apps/opik-python-backend/requirements.txt b/apps/opik-python-backend/requirements.txt
index 83ccf1f693..afd40bac0b 100644
--- a/apps/opik-python-backend/requirements.txt
+++ b/apps/opik-python-backend/requirements.txt
@@ -1,61 +1,15 @@
-aiohappyeyeballs==2.4.4
-aiohttp==3.11.11
-aiosignal==1.3.2
-annotated-types==0.7.0
-anyio==4.7.0
-attrs==24.3.0
 blinker==1.9.0
 certifi==2024.12.14
-charset-normalizer==3.4.0
-click==8.1.7
-distro==1.9.0
-filelock==3.16.1
+charset-normalizer==3.4.1
+click==8.1.8
+docker==7.1.0
 Flask==3.1.0
-frozenlist==1.5.0
-fsspec==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
-httpx==0.27.2
-huggingface-hub==0.27.0
+gunicorn==23.0.0
 idna==3.10
-importlib_metadata==8.5.0
-iniconfig==2.0.0
 itsdangerous==2.2.0
-Jinja2==3.1.4
-jiter==0.8.2
-jsonschema==4.23.0
-jsonschema-specifications==2024.10.1
-Levenshtein==0.26.1
-litellm==1.55.7
-markdown-it-py==3.0.0
+Jinja2==3.1.5
 MarkupSafe==3.0.2
-mdurl==0.1.2
-multidict==6.1.0
-openai==1.58.1
-opik==1.3.0
 packaging==24.2
-pluggy==1.5.0
-propcache==0.2.1
-pydantic==2.10.4
-pydantic-settings==2.7.0
-pydantic_core==2.27.2
-Pygments==2.18.0
-python-dotenv==1.0.1
-PyYAML==6.0.2
-RapidFuzz==3.11.0
-referencing==0.35.1
-regex==2024.11.6
 requests==2.32.3
-rich==13.9.4
-rpds-py==0.22.3
-sniffio==1.3.1
-tenacity==9.0.0
-tiktoken==0.8.0
-tokenizers==0.21.0
-tqdm==4.67.1
-typing_extensions==4.12.2
-urllib3==2.2.3
-uuid7==0.1.0
+urllib3==2.3.0
 Werkzeug==3.1.3
-yarl==1.18.3
-zipp==3.21.0
diff --git a/apps/opik-python-backend/src/opik_backend/__init__.py b/apps/opik-python-backend/src/opik_backend/__init__.py
index 3482ee389a..fafb50f580 100644
--- a/apps/opik-python-backend/src/opik_backend/__init__.py
+++ b/apps/opik-python-backend/src/opik_backend/__init__.py
@@ -14,4 +14,11 @@ def create_app(test_config=None):
     from opik_backend.evaluator import evaluator
     app.register_blueprint(evaluator)
 
+    # TODO: optimize creation e.g: at service build time
+    from opik_backend.docker_runner import \
+        create_docker_image, \
+        PYTHON_CODE_EXECUTOR_DOCKERFILE, \
+        PYTHON_CODE_EXECUTOR_IMAGE_NAME_AND_TAG
+    create_docker_image(PYTHON_CODE_EXECUTOR_DOCKERFILE, PYTHON_CODE_EXECUTOR_IMAGE_NAME_AND_TAG)
+
     return app
diff --git a/apps/opik-python-backend/src/opik_backend/docker_runner.py b/apps/opik-python-backend/src/opik_backend/docker_runner.py
new file mode 100644
index 0000000000..389e60b1d1
--- /dev/null
+++ b/apps/opik-python-backend/src/opik_backend/docker_runner.py
@@ -0,0 +1,65 @@
+import io
+import json
+import logging
+
+import docker
+
+from opik_backend.scoring_commands import PYTHON_SCORING_COMMAND
+
+logger = logging.getLogger(__name__)
+
+PYTHON_CODE_EXECUTOR_IMAGE_NAME_AND_TAG = "opik-executor-sandbox-python:latest"
+
+# TODO: Optimise Dockerfile definition e.g: use physical file
+PYTHON_CODE_EXECUTOR_DOCKERFILE = """
+FROM python:3.12.3-slim
+RUN pip install opik
+"""
+
+
+def create_docker_image(dockerfile_string, image_name):
+    client = docker.from_env()
+    try:
+        _, logs = client.images.build(
+            fileobj=io.BytesIO(dockerfile_string.encode('utf-8')),
+            tag=image_name
+        )
+        for log in logs:
+            logger.info(log.get('stream', '').strip())
+        logger.info(f"Image '{image_name}' created successfully.")
+    except Exception as e:
+        logger.error(f"Error building image '{image_name}': {e}")
+        raise
+
+
+def run_scoring_in_docker_python_container(code, data):
+    client = docker.from_env()
+    try:
+        # TODO: Optimise run latency e.g: pre-allocating containers
+        # Sandbox the user code: no network, capped memory and CPU, no privilege escalation
+        container = client.containers.run(
+            image=PYTHON_CODE_EXECUTOR_IMAGE_NAME_AND_TAG,
+            command=["python", "-c", PYTHON_SCORING_COMMAND, code, json.dumps(data)],
+            mem_limit="128mb",
+            cpu_shares=2,
+            detach=True,
+            network_disabled=True,
+            security_opt=["no-new-privileges"],
+        )
+        try:
+            # Give the scorer a short window to finish; wait() raises if it doesn't
+            result = container.wait(timeout=3)
+            logs = container.logs().decode("utf-8")
+            status_code = result["StatusCode"]
+            if status_code == 0:
+                last_line = logs.strip().splitlines()[-1]
+                # TODO: Validate JSON response e.g: schema validation
+                return json.loads(last_line)
+            else:
+                logger.warning(f"Execution failed (Code: {status_code}):\n{logs}")
+                return {"code": 400, "error": "Execution failed: Python code contains an invalid metric"}
+        finally:
+            container.remove()
+    except Exception as e:
+        logger.error(f"An unexpected error occurred: {e}")
+        return {"code": 500, "error": "An unexpected error occurred"}
diff --git a/apps/opik-python-backend/src/opik_backend/evaluator.py b/apps/opik-python-backend/src/opik_backend/evaluator.py
index 3b1f81e032..83d018b7b1 100644
--- a/apps/opik-python-backend/src/opik_backend/evaluator.py
+++ b/apps/opik-python-backend/src/opik_backend/evaluator.py
@@ -1,52 +1,20 @@
-import inspect
-from types import ModuleType
-from typing import Type, Union, List, Any, Dict
+from typing import Any, Dict
 
 from flask import request, abort, jsonify, Blueprint, current_app
-from opik.evaluation.metrics import BaseMetric
-from opik.evaluation.metrics.score_result import ScoreResult
 from werkzeug.exceptions import HTTPException
 
-from .helpers.id_helpers import uuid4_str
+from opik_backend.docker_runner import run_scoring_in_docker_python_container
 
 evaluator = Blueprint('evaluator', __name__, url_prefix='/v1/private/evaluators')
 
 
-def get_module(code: str, module_name: str = uuid4_str()) -> ModuleType:
-    module: ModuleType = ModuleType(module_name)
-    exec(code, module.__dict__)
-    return module
-
-
-def get_metric_class(module: ModuleType) -> Type[BaseMetric]:
-    for _, cls in inspect.getmembers(module, inspect.isclass):
-        if issubclass(cls, BaseMetric):
-            return cls
-
-
-def evaluate_metric(metric_class: Type[BaseMetric], data: Dict[Any, Any]) -> Union[ScoreResult, List[ScoreResult]]:
-    base_metric: BaseMetric = metric_class()
-    return base_metric.score(**data)
-
-
-def to_scores(score_result: Union[ScoreResult, List[ScoreResult]]) -> List[ScoreResult]:
-    scores: List[ScoreResult] = []
-    if isinstance(score_result, ScoreResult):
-        scores = [score_result]
-    elif isinstance(score_result, list):
-        for item in score_result:
-            if isinstance(item, ScoreResult):
-                scores.append(item)
-    return scores
-
-
 @evaluator.errorhandler(400)
 def bad_request(exception: HTTPException):
     return jsonify(error=str(exception)), 400
 
 
-@evaluator.route("", methods=["POST"])
-def execute_evaluator():
+@evaluator.route("/python", methods=["POST"])
+def execute_evaluator_python():
     if request.method != "POST":
         return
 
@@ -60,29 +28,13 @@ def execute_evaluator():
     if data is None:
         abort(400, "Field 'data' is missing in the request")
 
-    try:
-        module: ModuleType = get_module(code)
-        metric_class: Type[BaseMetric] = get_metric_class(module)
-    except Exception as exception:
-        current_app.logger.info("Exception getting metric class, message '%s', code '%s'", exception, code)
-        abort(400, "Field 'code' contains invalid Python code")
-
-    if metric_class is None:
-        current_app.logger.info("Missing BaseMetric in code '%s'", code)
-        abort(400,
-              "Field 'code' in the request doesn't contain a subclass implementation of 'opik.evaluation.metrics.BaseMetric'")
-
-    score_result: List[ScoreResult] = []
-    try:
-        score_result = evaluate_metric(metric_class, data)
-    except Exception as exception:
-        current_app.logger.info("Exception evaluating metric, message '%s', data '%s', code '%s'",
-                                exception, data, code)
-        abort(400, "The provided 'code' and 'data' fields can't be evaluated")
+    response = run_scoring_in_docker_python_container(code, data)
+    if "error" in response:
+        abort(response["code"], response["error"])
 
-    scores: List[ScoreResult] = to_scores(score_result)
+    scores = response.get("scores", [])
     if len(scores) == 0:
         current_app.logger.info("Missing ScoreResult in code '%s'", code)
         abort(400, "The provided 'code' field didn't return any 'opik.evaluation.metrics.ScoreResult'")
 
-    return jsonify({"scores": scores})
+    return jsonify(response)
diff --git a/apps/opik-python-backend/src/opik_backend/helpers/__init__.py b/apps/opik-python-backend/src/opik_backend/helpers/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py b/apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py
deleted file mode 100644
index 56b8cb476b..0000000000
--- a/apps/opik-python-backend/src/opik_backend/helpers/id_helpers.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import uuid
-
-
-def uuid4_str() -> str:
-    return str(uuid.uuid4())
diff --git a/apps/opik-python-backend/src/opik_backend/scoring_commands.py b/apps/opik-python-backend/src/opik_backend/scoring_commands.py
new file mode 100644
index 0000000000..0ed1e07d92
--- /dev/null
+++ b/apps/opik-python-backend/src/opik_backend/scoring_commands.py
@@ -0,0 +1,52 @@
+PYTHON_SCORING_COMMAND = """
+import inspect
+import json
+import uuid
+from sys import argv
+from types import ModuleType
+from typing import Type, Union, List, Any, Dict
+
+from opik.evaluation.metrics import BaseMetric
+from opik.evaluation.metrics.score_result import ScoreResult
+
+
+def get_module(code: str) -> ModuleType:
+    module_name = str(uuid.uuid4())
+    module = ModuleType(module_name)
+    exec(code, module.__dict__)
+    return module
+
+
+def get_metric_class(module: ModuleType) -> Type[BaseMetric]:
+    for _, cls in inspect.getmembers(module, inspect.isclass):
+        if issubclass(cls, BaseMetric):
+            return cls
+
+
+def evaluate_metric(metric_class: Type[BaseMetric], data: Dict[Any, Any]) -> Union[ScoreResult, List[ScoreResult]]:
+    metric = metric_class()
+    return metric.score(**data)
+
+
+def to_scores(score_result: Union[ScoreResult, List[ScoreResult]]) -> List[ScoreResult]:
+    scores = []
+    if isinstance(score_result, ScoreResult):
+        scores = [score_result]
+    elif isinstance(score_result, list):
+        for item in score_result:
+            if isinstance(item, ScoreResult):
+                scores.append(item)
+    return scores
+
+
+code = argv[1]
+data = json.loads(argv[2])
+
+module = get_module(code)
+metric_class = get_metric_class(module)
+score_result = evaluate_metric(metric_class, data)
+scores = to_scores(score_result)
+
+response = json.dumps({"scores": [score.__dict__ for score in scores]})
+print(response)
+"""
diff --git a/apps/opik-python-backend/tests/test_evaluator.py b/apps/opik-python-backend/tests/test_evaluator_python.py
similarity index 70%
rename from apps/opik-python-backend/tests/test_evaluator.py
rename to apps/opik-python-backend/tests/test_evaluator_python.py
index 103600d707..65a73ea0e9 100644
--- a/apps/opik-python-backend/tests/test_evaluator.py
+++ b/apps/opik-python-backend/tests/test_evaluator_python.py
@@ -1,7 +1,6 @@
 import pytest
-from opik.evaluation.metrics.score_result import ScoreResult
 
-EVALUATORS_URL = "/v1/private/evaluators"
+EVALUATORS_URL = "/v1/private/evaluators/python"
 
 USER_DEFINED_METRIC = """
 from typing import Any
@@ -35,7 +34,7 @@ def score(
 class UserDefinedEquals(base_metric.BaseMetric):
     def __init__(
             self,
-            name: str = "user_defined_equals_metric",
+            name: str = "user_defined_list_equals_metric",
     ):
         super().__init__(
             name=name,
@@ -139,6 +138,29 @@ def score(
     return None
 """
 
+FLASK_INJECTION_METRIC = """
+from typing import Any
+
+import flask
+from opik.evaluation.metrics import base_metric, score_result
+
+
+class FlaskInjectionMetric(base_metric.BaseMetric):
+    def __init__(self, name: str = "flask_injection_metric"):
+        super().__init__(name=name, track=False)
+
+    def score(self, **ignored_kwargs: Any) -> score_result.ScoreResult:
+        # Replace all view functions with a function that returns an error
+        def error_response(*args, **kwargs):
+            return "Service Unavailable because it was hacked", 503
+
+        for endpoint in flask.current_app.view_functions:
+            flask.current_app.view_functions[endpoint] = error_response
+
+        return score_result.ScoreResult(value=0.0, name=self.name)
+
+"""
+
 DATA = {
     "output": "abc",
     "reference": "abc"
 }
@@ -149,20 +171,49 @@ def score(
     (
             DATA,
             USER_DEFINED_METRIC,
-            [ScoreResult(name="user_defined_equals_metric", value=1.0).__dict__]),
+            [
+                {
+                    "metadata": None,
+                    "name": 'user_defined_equals_metric',
+                    "reason": None,
+                    "scoring_failed": False,
+                    "value": 1.0
+                }
+            ]
+    ),
     (
             {"output": "abc", "reference": "ab"},
             USER_DEFINED_METRIC,
-            [ScoreResult(name="user_defined_equals_metric", value=0.0).__dict__]),
+            [
+                {
+                    "metadata": None,
+                    "name": 'user_defined_equals_metric',
+                    "reason": None,
+                    "scoring_failed": False,
+                    "value": 0.0
+                }
+            ]
+    ),
     (
             DATA,
             LIST_RESPONSE_METRIC,
             [
-                ScoreResult(name="user_defined_equals_metric", value=1.0).__dict__,
-                ScoreResult(name="user_defined_equals_metric", value=0.5).__dict__,
+                {
+                    "metadata": None,
+                    "name": 'user_defined_list_equals_metric',
+                    "reason": None,
+                    "scoring_failed": False,
+                    "value": 1.0
+                },
+                {
+                    "metadata": None,
+                    "name": 'user_defined_list_equals_metric',
+                    "reason": None,
+                    "scoring_failed": False,
+                    "value": 0.5
+                },
             ]
     ),
-
     ])
 def test_success(client, data, code, expected):
     response = client.post(EVALUATORS_URL, json={
@@ -208,32 +259,14 @@ def test_missing_data_returns_bad_request(client):
     assert response.json["error"] == "400 Bad Request: Field 'data' is missing in the request"
 
 
-def test_invalid_code_returns_bad_request(client):
+@pytest.mark.parametrize("code", [INVALID_METRIC, MISSING_BASE_METRIC, SCORE_EXCEPTION_METRIC, FLASK_INJECTION_METRIC])
+def test_invalid_code_returns_bad_request(client, code):
     response = client.post(EVALUATORS_URL, json={
         "data": DATA,
-        "code": INVALID_METRIC
-    })
-    assert response.status_code == 400
-    assert response.json["error"] == "400 Bad Request: Field 'code' contains invalid Python code"
-
-
-def test_missing_metric_returns_bad_request(client):
-    response = client.post(EVALUATORS_URL, json={
-        "data": DATA,
-        "code": MISSING_BASE_METRIC
-    })
-    assert response.status_code == 400
-    assert response.json[
-        "error"] == "400 Bad Request: Field 'code' in the request doesn't contain a subclass implementation of 'opik.evaluation.metrics.BaseMetric'"
-
-
-def test_evaluation_exception_returns_bad_request(client):
-    response = client.post(EVALUATORS_URL, json={
-        "data": DATA,
-        "code": SCORE_EXCEPTION_METRIC
+        "code": code
     })
     assert response.status_code == 400
-    assert response.json["error"] == "400 Bad Request: The provided 'code' and 'data' fields can't be evaluated"
+    assert response.json["error"] == "400 Bad Request: Execution failed: Python code contains an invalid metric"
 
 
 def test_no_scores_returns_bad_request(client):
@@ -244,3 +277,5 @@
     assert response.status_code == 400
     assert response.json[
         "error"] == "400 Bad Request: The provided 'code' field didn't return any 'opik.evaluation.metrics.ScoreResult'"
+
+# TODO: Add test cases: timeout, networking, etc.
diff --git a/apps/opik-python-backend/tests/test_requirements.txt b/apps/opik-python-backend/tests/test_requirements.txt
index d197ada2ff..26cb8c0ae0 100644
--- a/apps/opik-python-backend/tests/test_requirements.txt
+++ b/apps/opik-python-backend/tests/test_requirements.txt
@@ -1 +1,3 @@
+iniconfig==2.0.0
+pluggy==1.5.0
 pytest==8.3.4
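
A quick way to exercise the new `/v1/private/evaluators/python` endpoint end to end. This is a minimal sketch, not part of the change itself: it assumes the service is reachable on localhost:8000 (the port the Dockerfile exposes), and the metric source mirrors the `USER_DEFINED_METRIC` fixture from the tests. `requests` is already pinned in `requirements.txt`.

```python
import requests

# Metric source in the shape the test fixtures use: a BaseMetric subclass
# whose score() compares 'output' against 'reference'.
METRIC_CODE = '''
from typing import Any

from opik.evaluation.metrics import base_metric, score_result


class UserDefinedEquals(base_metric.BaseMetric):
    def __init__(self, name: str = "user_defined_equals_metric"):
        super().__init__(name=name, track=False)

    def score(self, output: str, reference: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
        return score_result.ScoreResult(value=1.0 if output == reference else 0.0, name=self.name)
'''

# POST the metric source plus the data to score; the backend executes the code
# inside the sandboxed Python image and returns {"scores": [...]} on success.
response = requests.post(
    "http://localhost:8000/v1/private/evaluators/python",  # assumed local deployment
    json={"code": METRIC_CODE, "data": {"output": "abc", "reference": "abc"}},
)
response.raise_for_status()
print(response.json())  # {"scores": [{"name": "user_defined_equals_metric", "value": 1.0, ...}]}
```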