diff --git a/.gitignore b/.gitignore index 0566872e8..9684f76ea 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ docs/assets docs/repos/ logs/ +models-perf/ output/ site venv/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8626ad652..ae383ac20 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -72,12 +72,12 @@ repos: language: system name: pylint types: [python] - - entry: bash -c "python -m tox -e py310" + - entry: bash -c "python -m tox -e py310,clean" files: ^test-runner/ id: tox language: system name: tox - - entry: bash -c "mkdocs build --clean" + - entry: bash -c "rm -rf site/ && mkdocs build --clean" # files: ^docs/ id: mkdocs language: system diff --git a/test-runner/README.md b/test-runner/README.md index 2a9c2b9b0..096eb97c6 100644 --- a/test-runner/README.md +++ b/test-runner/README.md @@ -30,6 +30,7 @@ A test is defined as a set of commands to be executed along with their associate | [volumes](https://github.com/compose-spec/compose-spec/blob/master/spec.md#volumes) | Optional[List[[Volume](utils/test.py#L13)]] | A list of volumes to be mounted when running the test in a container. | | [env](https://github.com/compose-spec/compose-spec/blob/master/spec.md#environment) | Optional[Dict[str, str]] | A list of environment variables to be set when the test is running. | | mask | Optional[List[str]] | A list of keys to [mask](#masking) in the test output. | +| performance | Optional[str] | Check the test output against performance thresholds referenced in the format `perf/path/to/model.yaml:test-id`. | | notebook | Optional[str] | A flag indicating whether the test utilizes a [jupyter notebook](#notebook-test). | | serving | Optional[str] | A flag indicating whether a [serving test](#serving-test) should be invoked. | | [cap_add](https://github.com/compose-spec/compose-spec/blob/master/spec.md#cap_add) | Optional[str] | Specifies additional container capabilities. | @@ -75,12 +76,12 @@ In the example above, the first output will be `hello`, and the second output wi Masking is a feature that allows you to hide sensitive information in the logs generated by the test runner. This is useful when you want to prevent benchmark information from being publicly exposed. To enable masking, add the `mask` parameter to your `tests.yaml` file as a list of strings. Each string should be a key whose value you want to mask without any kind of delimiter. -By default, masking is not enabled. To enable masking, use the `-m` flag when running the test runner application. +By default, masking is enabled. To disable masking, add `"mask": [false]` to your `.actions.json` file. ```bash -python -m -f path/to/tests.yaml +python test_runner.py -f path/to/tests.yaml ``` ```bash @@ -92,6 +93,21 @@ test: cmd: "echo 'hello: world'" mask: - hello ``` In the example above, the output will be `hello:***` +#### Performance Thresholds + +You can check test output against performance thresholds stored in another GitHub repository by providing the `PERF_REPO` environment variable in GitHub's `org-name/repo-name` format. + +```yaml +test: + cmd: "echo 'my-key: 100'" + performance: perf/my-model.yaml:my-test-id +``` + +```bash +export PERF_REPO=... +python test-runner/test_runner.py -f path/to/tests.yaml +``` + #### Notebook Test A notebook test is a special type of test designed to run Jupyter notebooks. This is indicated by setting the notebook attribute to `True` in the test definition.
When a test is marked as a notebook test, the command specified in the cmd attribute is expected to be [papermill](https://github.com/nteract/papermill) command. If papermill is not already installed in the provided `image` property, then it will be installed. @@ -139,7 +155,7 @@ For more options, see the `--help` output below: ```text $ python test_runner.py --help -usage: test_runner.py [-h] [-a ACTIONS_PATH] -f FILE_PATH [-v] [-l LOGS_PATH] [-m] +usage: test_runner.py [-h] [-a ACTIONS_PATH] -f FILE_PATH [-v] [-l LOGS_PATH] optional arguments: -h, --help show this help message and exit @@ -150,7 +166,6 @@ optional arguments: -v, --verbose DEBUG Loglevel -l LOGS_PATH, --logs LOGS_PATH -l /path/to/logs - -m, --mask Enable mask parameter for sensitive information in logs ``` ### Run Modes diff --git a/test-runner/dev-requirements.txt b/test-runner/dev-requirements.txt index 1b6b198f2..4e061da2c 100644 --- a/test-runner/dev-requirements.txt +++ b/test-runner/dev-requirements.txt @@ -1,7 +1,9 @@ black>=24.4.1 coverage>=7.5.0 expandvars>=0.12.0 +gitpython>=3.1.43 hypothesis>=6.100.1 +Pint>=0.21.1 pydantic==2.7.2 pylint>=3.1.0 pytest>=8.1.1 diff --git a/test-runner/requirements.txt b/test-runner/requirements.txt index e76d70fc6..a0792e3ef 100644 --- a/test-runner/requirements.txt +++ b/test-runner/requirements.txt @@ -1,4 +1,6 @@ expandvars>=0.12.0 +gitpython>=3.1.43 +Pint>=0.21.1 pydantic==2.7.2 python_on_whales>=0.70.1 pyyaml>=6.0.1 diff --git a/test-runner/test_runner.py b/test-runner/test_runner.py index fd4e14743..4dc9c1b27 100644 --- a/test-runner/test_runner.py +++ b/test-runner/test_runner.py @@ -38,7 +38,7 @@ from expandvars import expandvars from python_on_whales import DockerException, docker from tabulate import tabulate -from utils.test import Test +from utils.test import PerfException, Test from yaml import YAMLError, full_load @@ -187,7 +187,7 @@ def get_test_list(args: dict, tests_yaml: List[dict]): # returns the stdout of the test and the RETURNCODE try: # Try for Runtime Failure Conditions log = test.container_run() if test.img else test.run() - except DockerException as err: + except (DockerException, PerfException, YAMLError) as err: logging.error(err) summary.append([idx + 1, test.name, "FAIL"]) ERROR = True diff --git a/test-runner/tests.yaml b/test-runner/tests.yaml index 26e18be2d..1e37c9a5c 100644 --- a/test-runner/tests.yaml +++ b/test-runner/tests.yaml @@ -15,17 +15,17 @@ test1: img: ${REGISTRY}/${REPO}:latest # substitute env from host cmd: head -n 1 /workspace/test-runner/requirements.txt # volume mounted file -# device: /dev/dri -# ipc: host + # device: /dev/dri + # ipc: host notebook: True env: REGISTRY: ${REGISTRY} # substitute env from host DEBUG: 'true' # single quotes volumes: - - src: /tf_dataset - dst: /tmp - - src: $PWD - dst: /workspace + - src: /tf_dataset + dst: /tmp + - src: $PWD + dst: /workspace test2: cmd: echo -n $TEST && python -c 'print(" World", end="")' # var substitution inline env: @@ -41,8 +41,13 @@ test6: img: ${CACHE_REGISTRY}/cache/library/python:3.11-slim-bullseye cmd: "echo 'hello: world'" mask: - - hello + - hello test7: cmd: "echo 'world: hello'" mask: - - world + - world +test8: + cmd: "echo 'test: 123 throughput'" + mask: + - test + performance: perf/test.yaml:test diff --git a/test-runner/tests/utest.py b/test-runner/tests/utest.py index 5b02911a5..af8c75f04 100644 --- a/test-runner/tests/utest.py +++ b/test-runner/tests/utest.py @@ -21,7 +21,7 @@ from hypothesis import given from hypothesis.strategies import dictionaries, text from 
test_runner import get_test_list, parse_args, set_log_filename -from utils.test import Test +from utils.test import PerfException, Test @pytest.fixture @@ -143,6 +143,11 @@ def test_get_test_list(test_args_input, test_json_input): "mask": ["hello"], }, "test7": {"cmd": "echo 'world: hello'", "mask": ["world"]}, + "test8": { + "cmd": "echo 'test: 123 throughput'", + "mask": ["test"], + "performance": "perf/test.yaml:test", + }, } test_fn, disable_masking = get_test_list(test_args_input, test_json_input) @@ -154,9 +159,44 @@ def test_masking(test_class_input): "test masking." for test in test_class_input: if test.mask != [] and test.img: - assert ":***" in test.container_run() + assert ": ***" in test.container_run() if test.mask != [] and not test.img: - assert ":***" in test.run() + assert ": ***" in test.run() + + +def test_perf_thresholds(): + "test performance thresholds." + test_cases = [ + { + "cmd": "echo 'test: 123 throughput'", + "performance": "perf/test.yaml:test", + "expected_output": "test: 123 throughput", + "should_raise_exception": False, + }, + { + "cmd": "echo 'test: 121 throughput'", + "performance": "perf/test.yaml:test", + "should_raise_exception": True, + }, + { + "cmd": "echo 'test: 123 millithroughput'", + "performance": "perf/test.yaml:test", + "should_raise_exception": True, + }, + { + "cmd": "echo 'test: 125 throughput'", + "performance": "perf/test.yaml:not-test", + "should_raise_exception": True, + }, + ] + + for test_case in test_cases: + test = Test(name="test", **test_case) + if test_case["should_raise_exception"]: + with pytest.raises(PerfException): + test.run() + else: + assert test_case["expected_output"] in test.run() @given(name=text(), arguments=dictionaries(text(), text())) diff --git a/test-runner/utils/test.py b/test-runner/utils/test.py index 9f3eb6a48..72f41efbd 100644 --- a/test-runner/utils/test.py +++ b/test-runner/utils/test.py @@ -21,9 +21,27 @@ from subprocess import PIPE, Popen from typing import Dict, List, Optional +import pint from expandvars import expandvars +from git import Repo from pydantic import BaseModel from python_on_whales import DockerException, docker +from yaml import YAMLError, full_load + +units = pint.UnitRegistry() + + +class PerfException(Exception): + "Constructs a PerfException class." + + +class Threshold(BaseModel): + "Constructs a Threshold class."
+ name: str + modelName: str + boundary: float + lower_is_better: bool + unit: str class Volume(BaseModel): @@ -49,12 +67,28 @@ class Test(BaseModel): groups_add: Optional[List[str]] = ["109", "44"] hostname: Optional[str] = None ipc: Optional[str] = None + performance: Optional[str] = None privileged: Optional[bool] = False pull: Optional[str] = "missing" user: Optional[str] = None shm_size: Optional[str] = None workdir: Optional[str] = None + def __init__(self, **data): + super().__init__(**data) + if self.performance: + perf_repo = os.environ.get("PERF_REPO") + if perf_repo: + if not os.path.exists("models-perf"): + Repo.clone_from( + f"https://github.com/{perf_repo}", "models-perf", progress=None + ) + else: + logging.error( + "Performance mode enabled, but PERF_REPO environment variable not set" + ) + units.load_definitions("./models-perf/definitions.txt") + def get_path(self, name): """Given a filename, find that file from the users current working directory @@ -171,6 +205,54 @@ def notebook_run(self, img: str): load=True, ) + def check_perf(self, content): + """ + Check the performance of the test against the thresholds. + + Args: + content (str): test output log + + Raises: + PerfException: if the performance does not meet the target performance + """ + with open( + f"models-perf/{self.performance.split(':')[0]}", "r", encoding="utf-8" + ) as file: + try: + thresholds = full_load(file) + except YAMLError as yaml_exc: + raise YAMLError(yaml_exc) + model_thresholds = [ + threshold + for threshold in thresholds + if self.performance.split(":")[1] == threshold["test_id"] + ] + for threshold in model_thresholds: + perf = re.search( + rf"{threshold['key']}[:]?\s+(.\d+[\s]?.*)", + content, + re.IGNORECASE, + ) + if perf: + if threshold["lower_is_better"]: + if units.Quantity(perf.group(1)) > units.Quantity( + f"{threshold['boundary']} {threshold['unit']}" + ): + if not self.mask: + logging.info("%s: %s", threshold["key"], perf.group(1)) + raise PerfException( + f"Performance Threshold {threshold['name']} did not meet the target performance." + ) + else: + if units.Quantity(perf.group(1)) < units.Quantity( + f"{threshold['boundary']} {threshold['unit']}" + ): + if not self.mask: + logging.info("%s: %s", threshold["key"], perf.group(1)) + raise PerfException( + f"Performance Threshold {threshold['name']} did not meet the target performance." + ) + def container_run(self): """Runs the docker container. 
@@ -235,9 +317,11 @@ def container_run(self): log = "" for _, stream_content in output_generator: # All process logs will have the stream_type of stderr despite it being stdout + if self.performance: + self.check_perf(stream_content.decode("utf-8")) for item in self.mask: stream_content = re.sub( - rf"({item}[:=-_\s])(.*)", + rf"({item}[:]?\s+)(.*)", r"\1***", stream_content.decode("utf-8"), ).encode("utf-8") @@ -271,14 +355,16 @@ def run(self): ) try: stdout, stderr = p.communicate() + if self.performance: + self.check_perf(stdout.decode("utf-8")) for item in self.mask: stdout = re.sub( - rf"({item}[:=-_\s])(.*)", r"\1***", stdout.decode("utf-8") + rf"({item}[:]?\s+)(.*)", r"\1***", stdout.decode("utf-8") ).encode("utf-8") if stderr: - logging.error(stderr.decode("utf-8")) + logging.error(stderr.decode("utf-8").strip()) if stdout: - logging.info("Test Output: %s", stdout.decode("utf-8")) + logging.info("Test Output: %s", stdout.decode("utf-8").strip()) return stdout.decode("utf-8") except KeyboardInterrupt: os.killpg(os.getpgid(p.pid), SIGKILL) diff --git a/tox.ini b/tox.ini index eec5baa23..66af473bf 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,7 @@ passenv = DOCKER_* setenv = CACHE_REGISTRY = {env:CACHE_REGISTRY} PATH = {env:PATH}:/usr/local/bin/docker + PERF_REPO = {env:PERF_REPO} PWD = {env:PWD} REGISTRY = {env:REGISTRY} REPO = {env:REPO} @@ -52,3 +53,9 @@ python = 3.11: py311 3.12: py312 parallel_show_output = true + +[testenv:clean] +allowlist_externals=/bin/bash +commands = + /bin/bash -c "rm -rf .coverage* models-perf" +ignore_errors = True
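For context on the new `performance` flow: `check_perf` opens `models-perf/<path>` (the part of `performance` before the colon), selects entries whose `test_id` matches the part after the colon, and compares the value captured after `key` in the test output against `boundary` expressed in `unit`. The thresholds file itself lives in the external `PERF_REPO` repository and is not part of this diff; the sketch below is a hypothetical `models-perf/perf/test.yaml`, inferred only from the keys `check_perf` reads and the values exercised by `test_perf_thresholds`.

```yaml
# Hypothetical models-perf/perf/test.yaml -- schema inferred from check_perf(),
# not copied from the real PERF_REPO repository.
- test_id: test            # matched against the part of `performance` after the ':'
  name: test-throughput    # reported in the PerfException message
  modelName: test-model    # assumed field, mirroring the Threshold model
  key: test                # output key located via regex, e.g. "test: 123 throughput"
  boundary: 123            # numeric target value
  unit: throughput         # custom Pint unit loaded from definitions.txt
  lower_is_better: false   # values below the boundary raise a PerfException
```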
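Because `check_perf` parses strings such as `123 throughput` and `123 millithroughput` with `units.Quantity`, `Test.__init__` loads `./models-perf/definitions.txt` into the Pint registry. That file is also external to this diff; a minimal, assumed sketch that would make those strings parseable is:

```text
# Hypothetical models-perf/definitions.txt (Pint unit definitions), assumed here.
# Registering throughput as a base unit lets Pint parse "123 throughput" and
# SI-prefixed forms such as "123 millithroughput".
throughput = [throughput]
```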