diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f9588c3c..63d96aef 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,8 +39,8 @@ jobs: matrix: runner: - s3-benchrunner-c - - s3-benchrunner-cli - s3-benchrunner-crt-java + - s3-benchrunner-python steps: - uses: actions/checkout@v4 diff --git a/runners/s3-benchrunner-python/README.md b/runners/s3-benchrunner-python/README.md index d39d3913..7d9175ef 100644 --- a/runners/s3-benchrunner-python/README.md +++ b/runners/s3-benchrunner-python/README.md @@ -1,11 +1,12 @@ # s3-benchrunner-cli ``` -usage: benchrunner.py [-h] [--verbose] [--use-existing-aws-config] BENCHMARK BUCKET REGION TARGET_THROUGHPUT +usage: benchrunner.py [-h] [--verbose] {crt,boto3-python,cli-python,cli-crt} BENCHMARK BUCKET REGION TARGET_THROUGHPUT -Benchmark runner for AWS CLI +Python benchmark runner. Pick which S3 library to use. positional arguments: + {crt,boto3-python,cli-python,cli-crt} BENCHMARK BUCKET REGION @@ -13,18 +14,36 @@ positional arguments: optional arguments: -h, --help show this help message and exit - --verbose Show CLI commands and their output - --use-existing-aws-config - If set, your existing AWS_CONFIG_FILE is used. (instead of one that customizes - 'preferred_transfer_client') + --verbose ``` -This runner uses your existing `aws` CLI installation. -If you want to build the CLI yourself, see [instructions below](#building-locally). +This is the runner for python libraries. Pass which library you want to benchmark: +* `crt`: Uses the [aws-crt-python](https://github.com/awslabs/aws-crt-python/) (the CRT bindings for python) directly. +* `boto3-python`: Uses [boto3](https://github.com/boto/boto3), with pure-python transfer manager. +* ~~boto3-crt: Uses boto3, with CRT transfer manager.~~ COMING SOON +* `cli-python`: Uses [AWS CLI](https://github.com/aws/aws-cli/), with pure-python transfer manager. +* `cli-crt`: Uses AWS CLI, with CRT transfer manager. -This runner skips benchmarks unless it can do them in a single AWS CLI command. +See [installation instructions](#installation) before running. + +### How this works with aws-crt-python + +When using aws-crt-python (async API), all tasks are kicked off from the main thread, +then it waits for all tasks to finish. The CRT maintains its own thread pool +(EventLoopGroup) where the actual work is done. + +### How this works with boto3 + +When using boto3 (synchronous API), a [ThreadPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor) +is used to run all tasks. This is significantly faster than running tasks one +after another on the main thread. I'm not sure if this is how Python programmers +would naturally do things, but it's simple enough to recommend given the huge payoff. + +### How this works with AWS CLI + +When using AWS CLI, this runner skips benchmarks unless it can do them in a single command. If we used multiple commands, one after another, performance would look bad -compared to other runners that run multiple commands in parallel. +compared to other libraries that run multiple commands in parallel. That's not a fair comparison (no one runs CLI commands in parallel) so we skip those benchmarks. Here are examples, showing how a given benchmark is run in a single CLI command: @@ -45,37 +64,69 @@ Here are examples, showing how a given benchmark is run in a single CLI command: * benchmark: `upload-5GiB-ram` * cmd: `<5GiB_random_data> | aws s3 cp - s3://my-s3-benchmarks/upload/5GiB/1` +# Installation + +## Quick install + +To test against the most recent public releases of these libraries: + +First, install the CLI if you don't already have it: +https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html + +Then install boto3 with CRT: +```sh +python3 -m pip install --upgrade "boto3[crt]" +``` + ## Building locally -Here are instructions to use a locally built AWS CLI. +To test against in-development of these libraries, we'll install from source: First, create a virtual environment, to isolate your dev versions from system defaults: ```sh python3 -m venv .venv +source .venv/bin/activate +``` + +Now make a build dir somewhere. +We're going to pull the source code for dependencies and install them... +``` +(.venv) cd path/to/my/build/dir ``` -Now we'll use python in the virtual environment... -Install some dependencies... +First, AWS CLI (`--editable` so we can modify its source without reinstalling): +```sh +(.venv) git clone --branch v2 https://github.com/aws/aws-cli.git +(.venv) python3 -m pip install --editable aws-cli ``` -.venv/bin/python3 -m pip install --upgrade pip boto3 + +Next boto3: +```sh +(.venv) git clone https://github.com/boto/boto3.git +(.venv) python3 -m pip install --editable boto3 +``` + +Next s3transfer: +```sh +(.venv) git clone https://github.com/boto/s3transfer.git +(.venv) python3 -m pip install --editable s3transfer ``` -Next, pull the AWS CLI source code and install it in your virtual environment -(`--editable` so we can modify its source without reinstalling): +Next botocore: ```sh -git clone --branch v2 https://github.com/aws/aws-cli.git -.venv/bin/python3 -m pip install --editable aws-cli +(.venv) git clone https://github.com/boto/botocore.git +(.venv) python3 -m pip install --editable botocore ``` -And if you want the latest aws-crt-python, pull it and install that too: +Finally aws-crt-python: ```sh -git clone --recurse-submodules https://github.com/awslabs/aws-crt-python.git -.venv/bin/python3 -m pip install --editable aws-crt-python +(.venv) git clone --recurse-submodules https://github.com/awslabs/aws-crt-python.git +(.venv) python3 -m pip install --editable aws-crt-python ``` pip complains that the newly installed 1.0.0.dev0 clashes -with the version requirements from awscli, but we ignore this. +with the version requirements from other packages, but we ignore this. -Now, you can execute the runner using your virtual environment with the latest CLI and CRT: +Now, you can execute the runner using your virtual environment with all the latest code: ```sh -.venv/bin/python3 path/to/aws-crt-s3-benchmarks/runners/s3-benchrunner-cli/benchrunner.py --help +(.venv) python3 path/to/aws-crt-s3-benchmarks/runners/s3-benchrunner-python/benchrunner.py --help ``` diff --git a/runners/s3-benchrunner-python/benchrunner.py b/runners/s3-benchrunner-python/benchrunner.py index e3193d9b..99fac04b 100755 --- a/runners/s3-benchrunner-python/benchrunner.py +++ b/runners/s3-benchrunner-python/benchrunner.py @@ -9,19 +9,24 @@ from dataclasses import dataclass import io import json +import os import os.path from pathlib import Path +import subprocess import sys +import tempfile import time from typing import List, Optional, Tuple PARSER = argparse.ArgumentParser( - description='Benchmark runner for python libs') -PARSER.add_argument('LIB', choices=('crt', 'boto3')) + description='Python benchmark runner. Pick which S3 library to use.') +PARSER.add_argument('LIB', choices=( + 'crt', 'boto3-python', 'cli-python', 'cli-crt')) PARSER.add_argument('BENCHMARK') PARSER.add_argument('BUCKET') PARSER.add_argument('REGION') PARSER.add_argument('TARGET_THROUGHPUT', type=float) +PARSER.add_argument('--verbose', action='store_true') def exit_with_skip_code(msg: str): @@ -29,6 +34,11 @@ def exit_with_skip_code(msg: str): exit(123) +def exit_with_error(msg: str): + print(f'FAIL - {msg}', file=sys.stderr) + exit(255) + + def ns_to_secs(ns: int) -> float: return ns / 1_000_000_000.0 @@ -59,15 +69,20 @@ class TaskConfig: @dataclass class BenchmarkConfig: - """Benchmark config, loaded from JSON""" + """Benchmark config""" + # loaded from json... files_on_disk: bool checksum: str max_repeat_count: int max_repeat_secs: int tasks: list[TaskConfig] + # passed on cmdline... + bucket: str + region: str + target_throughput_Gbps: float - @staticmethod - def from_json(benchmark_path: Path) -> 'BenchmarkConfig': + def __init__(self, benchmark_path: str, bucket: str, region: str, + target_throughput_Gbps: float, verbose: bool): with open(benchmark_path) as f: benchmark = json.load(f) @@ -75,14 +90,17 @@ def from_json(benchmark_path: Path) -> 'BenchmarkConfig': if version != 2: exit_with_skip_code(f'benchmark version not supported: {version}') - files_on_disk = benchmark['filesOnDisk'] - checksum = benchmark['checksum'] - max_repeat_count = benchmark['maxRepeatCount'] - max_repeat_secs = benchmark['maxRepeatSecs'] - tasks = [TaskConfig(task['action'], task['key'], task['size']) - for task in benchmark['tasks']] + self.files_on_disk = benchmark['filesOnDisk'] + self.checksum = benchmark['checksum'] + self.max_repeat_count = benchmark['maxRepeatCount'] + self.max_repeat_secs = benchmark['maxRepeatSecs'] + self.tasks = [TaskConfig(task['action'], task['key'], task['size']) + for task in benchmark['tasks']] - return BenchmarkConfig(files_on_disk, checksum, max_repeat_count, max_repeat_secs, tasks) + self.bucket = bucket + self.region = region + self.target_throughput_Gbps = target_throughput_Gbps + self.verbose = verbose def bytes_per_run(self) -> int: return sum([task.size for task in self.tasks]) @@ -91,12 +109,8 @@ def bytes_per_run(self) -> int: class Benchmark: """Base class for runnable benchmark""" - def __init__(self, config: BenchmarkConfig, bucket: str, region: str, - target_throughput_Gbps: float): + def __init__(self, config: BenchmarkConfig): self.config = config - self.bucket = bucket - self.region = region - self.target_throughput_Gbps = target_throughput_Gbps # If we're uploading, and not using files on disk, # then generate an in-memory buffer of random data to upload. @@ -108,6 +122,18 @@ def __init__(self, config: BenchmarkConfig, bucket: str, region: str, largest_upload = task.size self._random_data_for_upload = os.urandom(largest_upload) + @staticmethod + def create_runner_for_lib(lib: str, config: BenchmarkConfig) -> 'Benchmark': + """Factory function. Create appropriate subclass, given the lib.""" + if lib == 'crt': + return CrtBenchmark(config) + if lib == 'boto3-python': + return Boto3Benchmark(config) + if lib.startswith('cli-'): + return CliBenchmark(config, use_crt=lib.endswith('crt')) + else: + raise ValueError(f'Unknown lib: {lib}') + def run(self): raise NotImplementedError() @@ -122,8 +148,8 @@ def _new_iostream_to_upload_from_ram(self, size: int) -> io.BytesIO: class CrtBenchmark(Benchmark): """Runnable benchmark using aws-crt-python's S3Client""" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, config: BenchmarkConfig): + super().__init__(config) elg = awscrt.io.EventLoopGroup(cpu_group=0) resolver = awscrt.io.DefaultHostResolver(elg) @@ -132,14 +158,14 @@ def __init__(self, *args, **kwargs): bootstrap) signing_config = awscrt.s3.create_default_s3_signing_config( - region=self.region, + region=self.config.region, credential_provider=credential_provider) self._s3_client = awscrt.s3.S3Client( bootstrap=bootstrap, - region=self.region, + region=self.config.region, signing_config=signing_config, - throughput_target_gbps=self.target_throughput_Gbps) + throughput_target_gbps=self.config.target_throughput_Gbps) def run(self): # kick off all tasks @@ -154,7 +180,8 @@ def _make_request(self, task_i) -> Future: task = self.config.tasks[task_i] headers = awscrt.http.HttpHeaders() - headers.add('Host', f'{self.bucket}.s3.{self.region}.amazonaws.com') + headers.add( + 'Host', f'{self.config.bucket}.s3.{self.config.region}.amazonaws.com') path = f'/{task.key}' send_stream = None # if uploading from ram send_filepath = None # if uploading from disk @@ -167,8 +194,12 @@ def _make_request(self, task_i) -> Future: headers.add('Content-Type', 'application/octet-stream') if self.config.files_on_disk: + if self.config.verbose: + print(f'aws-crt-python upload from disk: {task.key}') send_filepath = task.key else: + if self.config.verbose: + print(f'aws-crt-python upload from RAM: {task.key}') send_stream = self._new_iostream_to_upload_from_ram(task.size) elif task.action == 'download': @@ -177,7 +208,12 @@ def _make_request(self, task_i) -> Future: headers.add('Content-Length', '0') if self.config.files_on_disk: + if self.config.verbose: + print(f'aws-crt-python download to disk: {task.key}') recv_filepath = task.key + else: + if self.config.verbose: + print(f'aws-crt-python download to RAM: {task.key}') # completion callback sets the future as complete, # or exits the program on error @@ -208,8 +244,10 @@ def on_done(error: Optional[Exception], class Boto3Benchmark(Benchmark): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + """Runnable benchmark using boto3.client('s3')""" + + def __init__(self, config: BenchmarkConfig): + super().__init__(config) self._s3_client = boto3.client('s3') @@ -218,22 +256,32 @@ def _make_request(self, task_i: int): if task.action == 'upload': if self.config.files_on_disk: - self._s3_client.upload_file(task.key, self.bucket, task.key) + if self.config.verbose: + print(f'boto3 upload_file("{task.key}")') + self._s3_client.upload_file( + task.key, self.config.bucket, task.key) else: + if self.config.verbose: + print(f'boto3 upload_fileobj("{task.key}")') upload_stream = self._new_iostream_to_upload_from_ram( task.size) self._s3_client.upload_fileobj( - upload_stream, self.bucket, task.key) + upload_stream, self.config.bucket, task.key) elif task.action == 'download': if self.config.files_on_disk: - self._s3_client.download_file(self.bucket, task.key, task.key) + if self.config.verbose: + print(f'boto3 download_file("{task.key}")') + self._s3_client.download_file( + self.config.bucket, task.key, task.key) else: + if self.config.verbose: + print(f'boto3 download_fileobj("{task.key}")') download_stream = Boto3DownloadFileObj() self._s3_client.download_fileobj( - self.bucket, task.key, download_stream) + self.config.bucket, task.key, download_stream) else: raise RuntimeError(f'Unknown action: {task.action}') @@ -258,14 +306,184 @@ def write(self, b): pass +class CliBenchmark(Benchmark): + def __init__(self, config: BenchmarkConfig, use_crt: bool): + super().__init__(config) + self.use_crt = use_crt + + # Write out temp AWS CLI config file, so it uses CRT or not. + # https://awscli.amazonaws.com/v2/documentation/api/latest/topic/s3-config.html + self._cli_config_file = tempfile.NamedTemporaryFile(prefix='awsconfig') + config_text = self._derive_cli_config() + self._cli_config_file.write(config_text.encode()) + self._cli_config_file.flush() + + os.environ['AWS_CONFIG_FILE'] = self._cli_config_file.name + + if self.config.verbose: + print(f'--- AWS_CONFIG_FILE ---') + print(config_text) + + self._cli_cmd, self._stdin_for_cli = self._derive_cli_cmd() + + def _derive_cli_config(self) -> str: + lines = ['[default]', + 's3 ='] + if self.use_crt: + lines += [' preferred_transfer_client = crt', + f' target_throughput = {self.config.target_throughput_Gbps} Gb/s'] + else: + lines += [' preferred_transfer_client = default'] + + lines += [''] # blank line at end of file + return '\n'.join(lines) + + def _derive_cli_cmd(self) -> Tuple[list[str], Optional[bytes]]: + """ + Figures out single CLI command that will do everything in the benchmark. + Exits with skip code if we can't do this benchmark in one CLI command. + + Returns (list_of_cli_args, optional_stdin_for_cli) + """ + num_tasks = len(self.config.tasks) + first_task = self.config.tasks[0] + + cmd = ['aws', 's3', 'cp'] + stdin: Optional[bytes] = None + + if num_tasks == 1: + # doing 1 file is simple, just name the src and dst + if first_task.action == 'download': + # src + cmd.append(f's3://{self.config.bucket}/{first_task.key}') + # dst + if self.config.files_on_disk: + cmd.append(first_task.key) + else: + cmd.append('-') # print file to stdout + + else: # upload + # src + if self.config.files_on_disk: + cmd.append(first_task.key) + else: + cmd.append('-') # read file from stdin + + stdin = self._random_data_for_upload + + # dst + cmd.append(f's3://{self.config.bucket}/{first_task.key}') + else: + # For CLI to do multiple files, we need to cp a directory + first_task_dir = os.path.split(first_task.key)[0] + + # Check that we can do all files in one cmd + for task_i in self.config.tasks[1:]: + if first_task_dir != os.path.split(task_i.key)[0]: + exit_with_skip_code( + 'CLI cannot run benchmark unless all keys are in the same directory') + + if first_task.action != task_i.action: + exit_with_skip_code( + 'CLI cannot run benchmark unless all actions are the same') + + if first_task.key == task_i.key: + exit_with_skip_code( + 'CLI cannot run benchmark that uses same key multiple times') + + if not self.config.files_on_disk: + exit_with_skip_code( + "CLI cannot run benchmark with multiple files unless they're on disk") + + # Add src and dst + if first_task.action == 'download': + # src + cmd.append(f's3://{self.config.bucket}/{first_task_dir}') + # dst + cmd.append(first_task_dir) + else: # upload + # src + cmd.append(first_task_dir) + # dst + cmd.append(f's3://{self.config.bucket}/{first_task_dir}') + + # Need --recursive to do multiple files + cmd.append('--recursive') + + # If not using all files in dir, --exclude "*" and then --include the ones we want. + if not self._using_all_files_in_dir(first_task.action, first_task_dir): + cmd += ['--exclude', '*'] + for task in self.config.tasks: + cmd += ['--include', os.path.split(task.key)[1]] + + # Add common options, used by all commands + cmd += ['--region', self.config.region] + + # As of Sept 2023, can't pick checksum for: aws s3 cp + if self.config.checksum: + exit_with_skip_code( + "CLI cannot run benchmark with specific checksum algorithm") + + return cmd, stdin + + def _using_all_files_in_dir(self, action: str, prefix: str) -> bool: + """ + Return True if benchmark uploads all files in dir, or downloads objects at S3 prefix. + Returns False if there are files that should be ignored. + """ + all_task_keys = {task.key for task in self.config.tasks} + + if action == 'download': + # Check all S3 objects at this prefix + s3 = boto3.client('s3', region_name=self.config.region) + + # list_objects_v2() is paginated, call in loop until we have all the data + s3 = boto3.client('s3') + paginator = s3.get_paginator('list_objects_v2') + for page in paginator.paginate(Bucket=self.config.bucket, Prefix=prefix): + for obj in page['Contents']: + if not obj['Key'] in all_task_keys: + return False + + else: # upload + # Check all files in this local dir + for child_i in Path(prefix).iterdir(): + if child_i.is_file(): + child_str = str(child_i) + if not child_str in all_task_keys: + return False + + return True + + def run(self): + run_kwargs = {'args': self._cli_cmd, + 'input': self._stdin_for_cli} + if self.config.verbose: + # show live output, and immediately raise exception if process fails + print(f'> {subprocess.list2cmdline(self._cli_cmd)}', flush=True) + run_kwargs['check'] = True + else: + # capture output, and only print if there's an error + run_kwargs['capture_output'] = True + + result = subprocess.run(**run_kwargs) + if result.returncode != 0: + # show command that failed, and stderr if any + errmsg = f'{subprocess.list2cmdline(self._cli_cmd)}' + stderr = result.stderr.decode().strip() + if stderr: + errmsg += f'\n{stderr}' + exit_with_error(errmsg) + + if __name__ == '__main__': args = PARSER.parse_args() - config = BenchmarkConfig.from_json(Path(args.BENCHMARK)) + config = BenchmarkConfig(args.BENCHMARK, args.BUCKET, args.REGION, + args.TARGET_THROUGHPUT, args.verbose) + + # create appropriate benchmark runner for given library + benchmark = Benchmark.create_runner_for_lib(args.LIB, config) - # Create CrtBenchmark or Boto3Benchmark - benchmark_class = CrtBenchmark if args.LIB == 'crt' else Boto3Benchmark - benchmark = benchmark_class(config, args.BUCKET, - args.REGION, args.TARGET_THROUGHPUT) bytes_per_run = config.bytes_per_run() # Repeat benchmark until we exceed max_repeat_count or max_repeat_secs